[clang] [llvm] [Clang][LLVM] Implement multi-single vectors MOP4{A/S} (PR #129226)
Jonathan Thackray via llvm-commits
llvm-commits at lists.llvm.org
Fri Feb 28 03:01:57 PST 2025
https://github.com/jthackray created https://github.com/llvm/llvm-project/pull/129226
Implement all multi-single {BF/F/S/U/SU/US}MOP4{A/S} instructions in Clang and
LLVM, following the ACLE proposal in https://github.com/ARM-software/acle/pull/381/files.
This PR depends on #128854
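
For illustration, a minimal sketch of how the new single-vector intrinsics are
called from C, mirroring the tests added below. The ZA tile index must be an
immediate in the range 0-3, and the caller must be in streaming mode with ZA
state available. The function name here is illustrative only:

  #include <arm_sme.h>

  // Accumulate the outer product of zn and zm into 32-bit ZA tile 0,
  // using the widening signed 8-bit form.
  void acc(svint8_t zn, svint8_t zm) __arm_streaming __arm_inout("za") {
    svmop4a_1x1_za32_s8_s8(0, zn, zm);
  }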
From 87b7d61f23b8aef863d37dcb137603b866ab8c77 Mon Sep 17 00:00:00 2001
From: Virginia Cangelosi <virginia.cangelosi at arm.com>
Date: Tue, 18 Feb 2025 11:02:07 +0000
Subject: [PATCH 01/11] [Clang][LLVM] Implement single-single vectors MOP4{A/S}
---
clang/include/clang/Basic/arm_sme.td | 54 ++
.../sme2-intrinsics/acle_sme2_mop4_1x1.c | 465 ++++++++++++++++++
llvm/include/llvm/IR/IntrinsicsAArch64.td | 51 +-
.../lib/Target/AArch64/AArch64SMEInstrInfo.td | 68 +--
llvm/lib/Target/AArch64/SMEInstrFormats.td | 93 +++-
.../AArch64/sme2-intrinsics-mop4a_1x1.ll | 247 ++++++++++
6 files changed, 903 insertions(+), 75 deletions(-)
create mode 100644 clang/test/CodeGen/AArch64/sme2-intrinsics/acle_sme2_mop4_1x1.c
create mode 100644 llvm/test/CodeGen/AArch64/sme2-intrinsics-mop4a_1x1.ll
diff --git a/clang/include/clang/Basic/arm_sme.td b/clang/include/clang/Basic/arm_sme.td
index 288a8c04c217f..2af29ad6699b6 100644
--- a/clang/include/clang/Basic/arm_sme.td
+++ b/clang/include/clang/Basic/arm_sme.td
@@ -376,6 +376,19 @@ let SMETargetGuard = "sme2" in {
// Outer product and accumulate/subtract
//
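+// Prototype "vidd": void return, an immediate ZA tile index, then two vectors
+// of the subject type; the `wide` argument appends "_wide" to the underlying
+// LLVM intrinsic name for the widening forms.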
+multiclass MOP4SingleSingle<string name, string n, string t, string i, string wide> {
+ def NAME : Inst<"svmop4" # name # "_1x1_" # n # "[_{d}_{d}]", "vidd", t, MergeNone, i # wide # "_1x1", [IsInOutZA, IsStreaming], [ImmCheck<0, ImmCheck0_3>]>;
+}
+
+multiclass MOP4MixedSignsSingleSingle<string n_suffix1, string n_suffix2, string za, string t> {
+ def NAME : SInst<"sv" # n_suffix2 # "_1x1_" # za # "[_{2}_{3}]",
+ "vid" # !cond(!eq(n_suffix1, "su") : "u", true: "x"),
+ !cond(!eq(n_suffix1, "su") : "", true: "U") # t,
+ MergeNone, "aarch64_sme_" # n_suffix2 # "_wide_1x1",
+ [IsStreaming, IsInOutZA],
+ [ImmCheck<0, ImmCheck0_3>]>;
+}
+
let SMETargetGuard = "sme2" in {
def SVSMOPA : Inst<"svmopa_za32[_{d}]_m", "viPPdd", "s", MergeNone, "aarch64_sme_smopa_za32", [IsInOutZA, IsStreaming], [ImmCheck<0, ImmCheck0_3>]>;
def SVUSMOPA : Inst<"svmopa_za32[_{d}]_m", "viPPdd", "Us", MergeNone, "aarch64_sme_umopa_za32", [IsInOutZA, IsStreaming], [ImmCheck<0, ImmCheck0_3>]>;
@@ -387,6 +400,29 @@ let SMETargetGuard = "sme2" in {
def SVBMOPS : Inst<"svbmops_za32[_{d}]_m", "viPPdd", "iUi", MergeNone, "aarch64_sme_bmops_za32", [IsInOutZA, IsStreaming], [ImmCheck<0, ImmCheck0_3>]>;
+ defm SVSMOP4A_MZZ_HtoS : MOP4SingleSingle<"a", "za32", "s", "aarch64_sme_mop4a", "_wide">;
+ defm SVSMOP4S_MZZ_HtoS : MOP4SingleSingle<"s", "za32", "s", "aarch64_sme_mop4s", "_wide">;
+ defm SVSMOP4A_MZZ_BToS : MOP4SingleSingle<"a", "za32", "c", "aarch64_sme_mop4a", "_wide">;
+ defm SVSMOP4S_MZZ_BToS : MOP4SingleSingle<"s", "za32", "c", "aarch64_sme_mop4s", "_wide">;
+
+ defm SVUMOP4A_MZZ_HtoS : MOP4SingleSingle<"a", "za32", "Us", "aarch64_sme_mop4a", "_wide">;
+ defm SVUMOP4S_MZZ_HtoS : MOP4SingleSingle<"s", "za32", "Us", "aarch64_sme_mop4s", "_wide">;
+ defm SVUMOP4A_MZZ_BToS : MOP4SingleSingle<"a", "za32", "Uc", "aarch64_sme_mop4a", "_wide">;
+ defm SVUMOP4S_MZZ_BToS : MOP4SingleSingle<"s", "za32", "Uc", "aarch64_sme_mop4s", "_wide">;
+
+ defm SVFMOP4A_MZZ_HtoS : MOP4SingleSingle<"a", "za32", "h", "aarch64_sme_mop4a", "_wide">;
+ defm SVFMOP4S_MZZ_HtoS : MOP4SingleSingle<"s", "za32", "h", "aarch64_sme_mop4s", "_wide">;
+ defm SVFMOP4A_MZZ_S : MOP4SingleSingle<"a", "za32", "f", "aarch64_sme_mop4a", "">;
+ defm SVFMOP4S_MZZ_S : MOP4SingleSingle<"s", "za32", "f", "aarch64_sme_mop4s", "">;
+
+ defm SVBMOP4A_MZZ_S : MOP4SingleSingle<"a", "za32", "b", "aarch64_sme_mop4a", "_wide">;
+ defm SVBMOP4S_MZZ_S : MOP4SingleSingle<"s", "za32", "b", "aarch64_sme_mop4s", "_wide">;
+
+ defm SVSUMOP4A_MZZ_BtoS : MOP4MixedSignsSingleSingle<"su", "mop4a", "za32", "c">;
+ defm SVUSMOP4A_MZZ_BtoS : MOP4MixedSignsSingleSingle<"us", "mop4a", "za32", "c">;
+ defm SVSUMOP4S_MZZ_BtoS : MOP4MixedSignsSingleSingle<"su", "mop4s", "za32", "c">;
+ defm SVUSMOP4S_MZZ_BtoS : MOP4MixedSignsSingleSingle<"us", "mop4s", "za32", "c">;
+
// VERTICAL DOT-PRODUCT
def SVVDOT_LANE_ZA32_VG1x2_S : Inst<"svvdot_lane_za32[_{d}]_vg1x2", "vm2di", "s", MergeNone, "aarch64_sme_svdot_lane_za32_vg1x2", [IsStreaming, IsInOutZA], [ImmCheck<3, ImmCheck0_3>]>;
def SVVDOT_LANE_ZA32_VG1x4_S : Inst<"svvdot_lane_za32[_{d}]_vg1x4", "vm4di", "c", MergeNone, "aarch64_sme_svdot_lane_za32_vg1x4", [IsStreaming, IsInOutZA], [ImmCheck<3, ImmCheck0_3>]>;
@@ -437,6 +473,15 @@ let SMETargetGuard = "sme2" in {
}
let SMETargetGuard = "sme2,sme-i16i64" in {
+ defm SVSMOP4A_MZZ_HtoD : MOP4SingleSingle<"a", "za64", "s", "aarch64_sme_mop4a", "_wide">;
+ defm SVSMOP4S_MZZ_HtoD : MOP4SingleSingle<"s", "za64", "s", "aarch64_sme_mop4s", "_wide">;
+ defm SVUMOP4A_MZZ_HtoD : MOP4SingleSingle<"a", "za64", "Us", "aarch64_sme_mop4a", "_wide">;
+ defm SVUMOP4S_MZZ_HtoD : MOP4SingleSingle<"s", "za64", "Us", "aarch64_sme_mop4s", "_wide">;
+ defm SVSUMOP4A_MZZ_HtoD : MOP4MixedSignsSingleSingle<"su", "mop4a", "za64", "s">;
+ defm SVUSMOP4A_MZZ_HtoD : MOP4MixedSignsSingleSingle<"us", "mop4a", "za64", "s">;
+ defm SVSUMOP4S_MZZ_HtoD : MOP4MixedSignsSingleSingle<"su", "mop4s", "za64", "s">;
+ defm SVUSMOP4S_MZZ_HtoD : MOP4MixedSignsSingleSingle<"us", "mop4s", "za64", "s">;
+
def SVVDOT_LANE_ZA64_VG1x4_S : Inst<"svvdot_lane_za64[_{d}]_vg1x4", "vm4di", "s", MergeNone, "aarch64_sme_svdot_lane_za64_vg1x4", [IsStreaming, IsInOutZA], [ImmCheck<3, ImmCheck0_1>]>;
def SVVDOT_LANE_ZA64_VG1x4_U : Inst<"svvdot_lane_za64[_{d}]_vg1x4", "vm4di", "Us", MergeNone, "aarch64_sme_uvdot_lane_za64_vg1x4", [IsStreaming, IsInOutZA], [ImmCheck<3, ImmCheck0_1>]>;
@@ -473,6 +518,9 @@ let SMETargetGuard = "sme2" in {
}
let SMETargetGuard = "sme2,sme-f64f64" in {
+ defm SVFMOP4A_MZZ_D : MOP4SingleSingle<"a", "za64", "d", "aarch64_sme_mop4a", "">;
+ defm SVFMOP4S_MZZ_D : MOP4SingleSingle<"s", "za64", "d", "aarch64_sme_mop4s", "">;
+
def SVMLA_MULTI_VG1x2_F64 : Inst<"svmla_za64[_{d}]_vg1x2", "vm22", "d", MergeNone, "aarch64_sme_fmla_vg1x2", [IsStreaming, IsInOutZA], []>;
def SVMLA_MULTI_VG1x4_F64 : Inst<"svmla_za64[_{d}]_vg1x4", "vm44", "d", MergeNone, "aarch64_sme_fmla_vg1x4", [IsStreaming, IsInOutZA], []>;
def SVMLS_MULTI_VG1x2_F64 : Inst<"svmls_za64[_{d}]_vg1x2", "vm22", "d", MergeNone, "aarch64_sme_fmls_vg1x2", [IsStreaming, IsInOutZA], []>;
@@ -490,6 +538,9 @@ let SMETargetGuard = "sme2,sme-f64f64" in {
}
let SMETargetGuard = "sme-f16f16" in {
+ defm SVFMOP4A_MZZ_H : MOP4SingleSingle<"a", "za16", "h", "aarch64_sme_mop4a", "">;
+ defm SVFMOP4S_MZZ_H : MOP4SingleSingle<"s", "za16", "h", "aarch64_sme_mop4s", "">;
+
def SVMLA_MULTI_VG1x2_F16 : Inst<"svmla_za16[_f16]_vg1x2", "vm22", "h", MergeNone, "aarch64_sme_fmla_vg1x2", [IsStreaming, IsInOutZA], []>;
def SVMLA_MULTI_VG1x4_F16 : Inst<"svmla_za16[_f16]_vg1x4", "vm44", "h", MergeNone, "aarch64_sme_fmla_vg1x4", [IsStreaming, IsInOutZA], []>;
def SVMLS_MULTI_VG1x2_F16 : Inst<"svmls_za16[_f16]_vg1x2", "vm22", "h", MergeNone, "aarch64_sme_fmls_vg1x2", [IsStreaming, IsInOutZA], []>;
@@ -507,6 +558,9 @@ let SMETargetGuard = "sme-f16f16" in {
}
let SMETargetGuard = "sme-b16b16" in {
+  defm SVBMOP4A_MZZ_H : MOP4SingleSingle<"a", "za16", "b", "aarch64_sme_mop4a", "">;
+  defm SVBMOP4S_MZZ_H : MOP4SingleSingle<"s", "za16", "b", "aarch64_sme_mop4s", "">;
+
def SVMLA_MULTI_VG1x2_BF16 : Inst<"svmla_za16[_bf16]_vg1x2", "vm22", "b", MergeNone, "aarch64_sme_fmla_vg1x2", [IsStreaming, IsInOutZA], []>;
def SVMLA_MULTI_VG1x4_BF16 : Inst<"svmla_za16[_bf16]_vg1x4", "vm44", "b", MergeNone, "aarch64_sme_fmla_vg1x4", [IsStreaming, IsInOutZA], []>;
def SVMLS_MULTI_VG1x2_BF16 : Inst<"svmls_za16[_bf16]_vg1x2", "vm22", "b", MergeNone, "aarch64_sme_fmls_vg1x2", [IsStreaming, IsInOutZA], []>;
diff --git a/clang/test/CodeGen/AArch64/sme2-intrinsics/acle_sme2_mop4_1x1.c b/clang/test/CodeGen/AArch64/sme2-intrinsics/acle_sme2_mop4_1x1.c
new file mode 100644
index 0000000000000..37238053009fd
--- /dev/null
+++ b/clang/test/CodeGen/AArch64/sme2-intrinsics/acle_sme2_mop4_1x1.c
@@ -0,0 +1,465 @@
+// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py
+
+// REQUIRES: aarch64-registered-target
+// RUN: %clang_cc1 -triple aarch64 -target-feature +bf16 -target-feature +sme-mop4 -target-feature +sme-f16f16 -target-feature +sme-i16i64 -target-feature +sme-b16b16 -target-feature +sme-f64f64 -target-feature +sme -target-feature +sme2 -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s
+// RUN: %clang_cc1 -triple aarch64 -target-feature +bf16 -target-feature +sme-mop4 -target-feature +sme-f16f16 -target-feature +sme-i16i64 -target-feature +sme-b16b16 -target-feature +sme-f64f64 -target-feature +sme -target-feature +sme2 -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK
+// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64 -target-feature +bf16 -target-feature +sme-mop4 -target-feature +sme-f16f16 -target-feature +sme-i16i64 -target-feature +sme-b16b16 -target-feature +sme-f64f64 -target-feature +sme -target-feature +sme2 -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s
+// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64 -target-feature +bf16 -target-feature +sme-mop4 -target-feature +sme-f16f16 -target-feature +sme-i16i64 -target-feature +sme-b16b16 -target-feature +sme-f64f64 -target-feature +sme -target-feature +sme2 -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK
+// RUN: %clang_cc1 -triple aarch64 -target-feature +bf16 -target-feature +sme-mop4 -target-feature +sme-f16f16 -target-feature +sme-i16i64 -target-feature +sme-b16b16 -target-feature +sme-f64f64 -target-feature +sme -target-feature +sme2 -S -disable-O0-optnone -Werror -Wall -o /dev/null %s
+
+
+#include <arm_sme.h>
+
+#ifdef SVE_OVERLOADED_FORMS
+#define SME_ACLE_FUNC(A1,A2_UNUSED,A3) A1##A3
+#else
+#define SME_ACLE_FUNC(A1,A2,A3) A1##A2##A3
+#endif
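+
+// With SVE_OVERLOADED_FORMS defined, SME_ACLE_FUNC(svmop4a_1x1_za32,_s8_s8,)
+// resolves to the overloaded name svmop4a_1x1_za32; otherwise it resolves to
+// the fully suffixed svmop4a_1x1_za32_s8_s8.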
+
+// CHECK-LABEL: @test_svmop4a_1x1_za32_s8_s8(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: tail call void @llvm.aarch64.sme.mop4a.wide.1x1.nxv16i8(i32 3, <vscale x 16 x i8> [[ZN:%.*]], <vscale x 16 x i8> [[ZM:%.*]])
+// CHECK-NEXT: ret void
+//
+// CPP-CHECK-LABEL: @_Z27test_svmop4a_1x1_za32_s8_s8u10__SVInt8_tS_(
+// CPP-CHECK-NEXT: entry:
+// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.mop4a.wide.1x1.nxv16i8(i32 3, <vscale x 16 x i8> [[ZN:%.*]], <vscale x 16 x i8> [[ZM:%.*]])
+// CPP-CHECK-NEXT: ret void
+//
+void test_svmop4a_1x1_za32_s8_s8(svint8_t zn, svint8_t zm) __arm_streaming __arm_inout("za") {
+ SME_ACLE_FUNC(svmop4a_1x1_za32,_s8_s8,)(3, zn, zm);
+}
+
+// CHECK-LABEL: @test_svmop4s_1x1_za32_s8_s8(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: tail call void @llvm.aarch64.sme.mop4s.wide.1x1.nxv16i8(i32 3, <vscale x 16 x i8> [[ZN:%.*]], <vscale x 16 x i8> [[ZM:%.*]])
+// CHECK-NEXT: ret void
+//
+// CPP-CHECK-LABEL: @_Z27test_svmop4s_1x1_za32_s8_s8u10__SVInt8_tS_(
+// CPP-CHECK-NEXT: entry:
+// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.mop4s.wide.1x1.nxv16i8(i32 3, <vscale x 16 x i8> [[ZN:%.*]], <vscale x 16 x i8> [[ZM:%.*]])
+// CPP-CHECK-NEXT: ret void
+//
+void test_svmop4s_1x1_za32_s8_s8(svint8_t zn, svint8_t zm) __arm_streaming __arm_inout("za") {
+ SME_ACLE_FUNC(svmop4s_1x1_za32,_s8_s8,)(3, zn, zm);
+}
+
+// CHECK-LABEL: @test_svmop4a_1x1_za32_u8_u8(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: tail call void @llvm.aarch64.sme.mop4a.wide.1x1.nxv16i8(i32 3, <vscale x 16 x i8> [[ZN:%.*]], <vscale x 16 x i8> [[ZM:%.*]])
+// CHECK-NEXT: ret void
+//
+// CPP-CHECK-LABEL: @_Z27test_svmop4a_1x1_za32_u8_u8u11__SVUint8_tS_(
+// CPP-CHECK-NEXT: entry:
+// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.mop4a.wide.1x1.nxv16i8(i32 3, <vscale x 16 x i8> [[ZN:%.*]], <vscale x 16 x i8> [[ZM:%.*]])
+// CPP-CHECK-NEXT: ret void
+//
+void test_svmop4a_1x1_za32_u8_u8(svuint8_t zn, svuint8_t zm) __arm_streaming __arm_inout("za") {
+ SME_ACLE_FUNC(svmop4a_1x1_za32,_u8_u8,)(3, zn, zm);
+}
+
+// CHECK-LABEL: @test_svmop4s_1x1_za32_u8_u8(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: tail call void @llvm.aarch64.sme.mop4s.wide.1x1.nxv16i8(i32 3, <vscale x 16 x i8> [[ZN:%.*]], <vscale x 16 x i8> [[ZM:%.*]])
+// CHECK-NEXT: ret void
+//
+// CPP-CHECK-LABEL: @_Z27test_svmop4s_1x1_za32_u8_u8u11__SVUint8_tS_(
+// CPP-CHECK-NEXT: entry:
+// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.mop4s.wide.1x1.nxv16i8(i32 3, <vscale x 16 x i8> [[ZN:%.*]], <vscale x 16 x i8> [[ZM:%.*]])
+// CPP-CHECK-NEXT: ret void
+//
+void test_svmop4s_1x1_za32_u8_u8(svuint8_t zn, svuint8_t zm) __arm_streaming __arm_inout("za") {
+ SME_ACLE_FUNC(svmop4s_1x1_za32,_u8_u8,)(3, zn, zm);
+}
+
+// CHECK-LABEL: @test_svmop4a_1x1_za32_s16_s16(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: tail call void @llvm.aarch64.sme.mop4a.wide.1x1.nxv8i16(i32 3, <vscale x 8 x i16> [[ZN:%.*]], <vscale x 8 x i16> [[ZM:%.*]])
+// CHECK-NEXT: ret void
+//
+// CPP-CHECK-LABEL: @_Z29test_svmop4a_1x1_za32_s16_s16u11__SVInt16_tS_(
+// CPP-CHECK-NEXT: entry:
+// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.mop4a.wide.1x1.nxv8i16(i32 3, <vscale x 8 x i16> [[ZN:%.*]], <vscale x 8 x i16> [[ZM:%.*]])
+// CPP-CHECK-NEXT: ret void
+//
+void test_svmop4a_1x1_za32_s16_s16(svint16_t zn, svint16_t zm) __arm_streaming __arm_inout("za") {
+ SME_ACLE_FUNC(svmop4a_1x1_za32,_s16_s16,)(3, zn, zm);
+}
+
+// CHECK-LABEL: @test_svmop4s_1x1_za32_s16_s16(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: tail call void @llvm.aarch64.sme.mop4s.wide.1x1.nxv8i16(i32 3, <vscale x 8 x i16> [[ZN:%.*]], <vscale x 8 x i16> [[ZM:%.*]])
+// CHECK-NEXT: ret void
+//
+// CPP-CHECK-LABEL: @_Z29test_svmop4s_1x1_za32_s16_s16u11__SVInt16_tS_(
+// CPP-CHECK-NEXT: entry:
+// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.mop4s.wide.1x1.nxv8i16(i32 3, <vscale x 8 x i16> [[ZN:%.*]], <vscale x 8 x i16> [[ZM:%.*]])
+// CPP-CHECK-NEXT: ret void
+//
+void test_svmop4s_1x1_za32_s16_s16(svint16_t zn, svint16_t zm) __arm_streaming __arm_inout("za") {
+ SME_ACLE_FUNC(svmop4s_1x1_za32,_s16_s16,)(3, zn, zm);
+}
+
+// CHECK-LABEL: @test_svmop4a_1x1_za32_u16_u16(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: tail call void @llvm.aarch64.sme.mop4a.wide.1x1.nxv8i16(i32 3, <vscale x 8 x i16> [[ZN:%.*]], <vscale x 8 x i16> [[ZM:%.*]])
+// CHECK-NEXT: ret void
+//
+// CPP-CHECK-LABEL: @_Z29test_svmop4a_1x1_za32_u16_u16u12__SVUint16_tS_(
+// CPP-CHECK-NEXT: entry:
+// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.mop4a.wide.1x1.nxv8i16(i32 3, <vscale x 8 x i16> [[ZN:%.*]], <vscale x 8 x i16> [[ZM:%.*]])
+// CPP-CHECK-NEXT: ret void
+//
+void test_svmop4a_1x1_za32_u16_u16(svuint16_t zn, svuint16_t zm) __arm_streaming __arm_inout("za") {
+ SME_ACLE_FUNC(svmop4a_1x1_za32,_u16_u16,)(3, zn, zm);
+}
+
+// CHECK-LABEL: @test_svmop4s_1x1_za32_u16_u16(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: tail call void @llvm.aarch64.sme.mop4s.wide.1x1.nxv8i16(i32 3, <vscale x 8 x i16> [[ZN:%.*]], <vscale x 8 x i16> [[ZM:%.*]])
+// CHECK-NEXT: ret void
+//
+// CPP-CHECK-LABEL: @_Z29test_svmop4s_1x1_za32_u16_u16u12__SVUint16_tS_(
+// CPP-CHECK-NEXT: entry:
+// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.mop4s.wide.1x1.nxv8i16(i32 3, <vscale x 8 x i16> [[ZN:%.*]], <vscale x 8 x i16> [[ZM:%.*]])
+// CPP-CHECK-NEXT: ret void
+//
+void test_svmop4s_1x1_za32_u16_u16(svuint16_t zn, svuint16_t zm) __arm_streaming __arm_inout("za") {
+ SME_ACLE_FUNC(svmop4s_1x1_za32,_u16_u16,)(3, zn, zm);
+}
+
+// CHECK-LABEL: @test_svmop4a_1x1_za32_f16_f16(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: tail call void @llvm.aarch64.sme.mop4a.wide.1x1.nxv8f16(i32 3, <vscale x 8 x half> [[ZN:%.*]], <vscale x 8 x half> [[ZM:%.*]])
+// CHECK-NEXT: ret void
+//
+// CPP-CHECK-LABEL: @_Z29test_svmop4a_1x1_za32_f16_f16u13__SVFloat16_tS_(
+// CPP-CHECK-NEXT: entry:
+// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.mop4a.wide.1x1.nxv8f16(i32 3, <vscale x 8 x half> [[ZN:%.*]], <vscale x 8 x half> [[ZM:%.*]])
+// CPP-CHECK-NEXT: ret void
+//
+void test_svmop4a_1x1_za32_f16_f16(svfloat16_t zn, svfloat16_t zm) __arm_streaming __arm_inout("za") {
+ SME_ACLE_FUNC(svmop4a_1x1_za32,_f16_f16,)(3, zn, zm);
+}
+
+// CHECK-LABEL: @test_svmop4s_1x1_za32_f16_f16(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: tail call void @llvm.aarch64.sme.mop4s.wide.1x1.nxv8f16(i32 3, <vscale x 8 x half> [[ZN:%.*]], <vscale x 8 x half> [[ZM:%.*]])
+// CHECK-NEXT: ret void
+//
+// CPP-CHECK-LABEL: @_Z29test_svmop4s_1x1_za32_f16_f16u13__SVFloat16_tS_(
+// CPP-CHECK-NEXT: entry:
+// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.mop4s.wide.1x1.nxv8f16(i32 3, <vscale x 8 x half> [[ZN:%.*]], <vscale x 8 x half> [[ZM:%.*]])
+// CPP-CHECK-NEXT: ret void
+//
+void test_svmop4s_1x1_za32_f16_f16(svfloat16_t zn, svfloat16_t zm) __arm_streaming __arm_inout("za") {
+ SME_ACLE_FUNC(svmop4s_1x1_za32,_f16_f16,)(3, zn, zm);
+}
+
+// CHECK-LABEL: @test_svmop4a_1x1_za32_bf16_bf16(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: tail call void @llvm.aarch64.sme.mop4a.wide.1x1.nxv8bf16(i32 3, <vscale x 8 x bfloat> [[ZN:%.*]], <vscale x 8 x bfloat> [[ZM:%.*]])
+// CHECK-NEXT: ret void
+//
+// CPP-CHECK-LABEL: @_Z31test_svmop4a_1x1_za32_bf16_bf16u14__SVBfloat16_tS_(
+// CPP-CHECK-NEXT: entry:
+// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.mop4a.wide.1x1.nxv8bf16(i32 3, <vscale x 8 x bfloat> [[ZN:%.*]], <vscale x 8 x bfloat> [[ZM:%.*]])
+// CPP-CHECK-NEXT: ret void
+//
+void test_svmop4a_1x1_za32_bf16_bf16(svbfloat16_t zn, svbfloat16_t zm) __arm_streaming __arm_inout("za") {
+ SME_ACLE_FUNC(svmop4a_1x1_za32,_bf16_bf16,)(3, zn, zm);
+}
+
+// CHECK-LABEL: @test_svmop4s_1x1_za32_bf16_bf16(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: tail call void @llvm.aarch64.sme.mop4s.wide.1x1.nxv8bf16(i32 3, <vscale x 8 x bfloat> [[ZN:%.*]], <vscale x 8 x bfloat> [[ZM:%.*]])
+// CHECK-NEXT: ret void
+//
+// CPP-CHECK-LABEL: @_Z31test_svmop4s_1x1_za32_bf16_bf16u14__SVBfloat16_tS_(
+// CPP-CHECK-NEXT: entry:
+// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.mop4s.wide.1x1.nxv8bf16(i32 3, <vscale x 8 x bfloat> [[ZN:%.*]], <vscale x 8 x bfloat> [[ZM:%.*]])
+// CPP-CHECK-NEXT: ret void
+//
+void test_svmop4s_1x1_za32_bf16_bf16(svbfloat16_t zn, svbfloat16_t zm) __arm_streaming __arm_inout("za") {
+ SME_ACLE_FUNC(svmop4s_1x1_za32,_bf16_bf16,)(3, zn, zm);
+}
+
+// CHECK-LABEL: @test_svmop4a_1x1_za64_s16_s16(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: tail call void @llvm.aarch64.sme.mop4a.wide.1x1.nxv8i16(i32 3, <vscale x 8 x i16> [[ZN:%.*]], <vscale x 8 x i16> [[ZM:%.*]])
+// CHECK-NEXT: ret void
+//
+// CPP-CHECK-LABEL: @_Z29test_svmop4a_1x1_za64_s16_s16u11__SVInt16_tS_(
+// CPP-CHECK-NEXT: entry:
+// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.mop4a.wide.1x1.nxv8i16(i32 3, <vscale x 8 x i16> [[ZN:%.*]], <vscale x 8 x i16> [[ZM:%.*]])
+// CPP-CHECK-NEXT: ret void
+//
+void test_svmop4a_1x1_za64_s16_s16(svint16_t zn, svint16_t zm) __arm_streaming __arm_inout("za") {
+ SME_ACLE_FUNC(svmop4a_1x1_za64,_s16_s16,)(3, zn, zm);
+}
+
+// CHECK-LABEL: @test_svmop4s_1x1_za64_s16_s16(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: tail call void @llvm.aarch64.sme.mop4s.wide.1x1.nxv8i16(i32 3, <vscale x 8 x i16> [[ZN:%.*]], <vscale x 8 x i16> [[ZM:%.*]])
+// CHECK-NEXT: ret void
+//
+// CPP-CHECK-LABEL: @_Z29test_svmop4s_1x1_za64_s16_s16u11__SVInt16_tS_(
+// CPP-CHECK-NEXT: entry:
+// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.mop4s.wide.1x1.nxv8i16(i32 3, <vscale x 8 x i16> [[ZN:%.*]], <vscale x 8 x i16> [[ZM:%.*]])
+// CPP-CHECK-NEXT: ret void
+//
+void test_svmop4s_1x1_za64_s16_s16(svint16_t zn, svint16_t zm) __arm_streaming __arm_inout("za") {
+ SME_ACLE_FUNC(svmop4s_1x1_za64,_s16_s16,)(3, zn, zm);
+}
+
+// CHECK-LABEL: @test_svmop4a_1x1_za64_u16_u16(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: tail call void @llvm.aarch64.sme.mop4a.wide.1x1.nxv8i16(i32 3, <vscale x 8 x i16> [[ZN:%.*]], <vscale x 8 x i16> [[ZM:%.*]])
+// CHECK-NEXT: ret void
+//
+// CPP-CHECK-LABEL: @_Z29test_svmop4a_1x1_za64_u16_u16u12__SVUint16_tS_(
+// CPP-CHECK-NEXT: entry:
+// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.mop4a.wide.1x1.nxv8i16(i32 3, <vscale x 8 x i16> [[ZN:%.*]], <vscale x 8 x i16> [[ZM:%.*]])
+// CPP-CHECK-NEXT: ret void
+//
+void test_svmop4a_1x1_za64_u16_u16(svuint16_t zn, svuint16_t zm) __arm_streaming __arm_inout("za") {
+ SME_ACLE_FUNC(svmop4a_1x1_za64,_u16_u16,)(3, zn, zm);
+}
+
+// CHECK-LABEL: @test_svmop4s_1x1_za64_u16_u16(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: tail call void @llvm.aarch64.sme.mop4s.wide.1x1.nxv8i16(i32 3, <vscale x 8 x i16> [[ZN:%.*]], <vscale x 8 x i16> [[ZM:%.*]])
+// CHECK-NEXT: ret void
+//
+// CPP-CHECK-LABEL: @_Z29test_svmop4s_1x1_za64_u16_u16u12__SVUint16_tS_(
+// CPP-CHECK-NEXT: entry:
+// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.mop4s.wide.1x1.nxv8i16(i32 3, <vscale x 8 x i16> [[ZN:%.*]], <vscale x 8 x i16> [[ZM:%.*]])
+// CPP-CHECK-NEXT: ret void
+//
+void test_svmop4s_1x1_za64_u16_u16(svuint16_t zn, svuint16_t zm) __arm_streaming __arm_inout("za") {
+ SME_ACLE_FUNC(svmop4s_1x1_za64,_u16_u16,)(3, zn, zm);
+}
+
+// CHECK-LABEL: @test_svmop4a_1x1_za64_s16_u16(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: tail call void @llvm.aarch64.sme.mop4a.wide.1x1.nxv8i16(i32 3, <vscale x 8 x i16> [[ZN:%.*]], <vscale x 8 x i16> [[ZM:%.*]])
+// CHECK-NEXT: ret void
+//
+// CPP-CHECK-LABEL: @_Z29test_svmop4a_1x1_za64_s16_u16u11__SVInt16_tu12__SVUint16_t(
+// CPP-CHECK-NEXT: entry:
+// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.mop4a.wide.1x1.nxv8i16(i32 3, <vscale x 8 x i16> [[ZN:%.*]], <vscale x 8 x i16> [[ZM:%.*]])
+// CPP-CHECK-NEXT: ret void
+//
+void test_svmop4a_1x1_za64_s16_u16(svint16_t zn, svuint16_t zm) __arm_streaming __arm_inout("za") {
+ SME_ACLE_FUNC(svmop4a_1x1_za64,_s16_u16,)(3, zn, zm);
+}
+
+// CHECK-LABEL: @test_svmop4s_1x1_za64_s16_u16(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: tail call void @llvm.aarch64.sme.mop4s.wide.1x1.nxv8i16(i32 3, <vscale x 8 x i16> [[ZN:%.*]], <vscale x 8 x i16> [[ZM:%.*]])
+// CHECK-NEXT: ret void
+//
+// CPP-CHECK-LABEL: @_Z29test_svmop4s_1x1_za64_s16_u16u11__SVInt16_tu12__SVUint16_t(
+// CPP-CHECK-NEXT: entry:
+// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.mop4s.wide.1x1.nxv8i16(i32 3, <vscale x 8 x i16> [[ZN:%.*]], <vscale x 8 x i16> [[ZM:%.*]])
+// CPP-CHECK-NEXT: ret void
+//
+void test_svmop4s_1x1_za64_s16_u16(svint16_t zn, svuint16_t zm) __arm_streaming __arm_inout("za") {
+ SME_ACLE_FUNC(svmop4s_1x1_za64,_s16_u16,)(3, zn, zm);
+}
+
+// CHECK-LABEL: @test_svmop4a_1x1_za64_u16_s16(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: tail call void @llvm.aarch64.sme.mop4a.wide.1x1.nxv8i16(i32 3, <vscale x 8 x i16> [[ZN:%.*]], <vscale x 8 x i16> [[ZM:%.*]])
+// CHECK-NEXT: ret void
+//
+// CPP-CHECK-LABEL: @_Z29test_svmop4a_1x1_za64_u16_s16u12__SVUint16_tu11__SVInt16_t(
+// CPP-CHECK-NEXT: entry:
+// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.mop4a.wide.1x1.nxv8i16(i32 3, <vscale x 8 x i16> [[ZN:%.*]], <vscale x 8 x i16> [[ZM:%.*]])
+// CPP-CHECK-NEXT: ret void
+//
+void test_svmop4a_1x1_za64_u16_s16(svuint16_t zn, svint16_t zm) __arm_streaming __arm_inout("za") {
+ SME_ACLE_FUNC(svmop4a_1x1_za64,_u16_s16,)(3, zn, zm);
+}
+
+// CHECK-LABEL: @test_svmop4s_1x1_za64_u16_s16(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: tail call void @llvm.aarch64.sme.mop4s.wide.1x1.nxv8i16(i32 3, <vscale x 8 x i16> [[ZN:%.*]], <vscale x 8 x i16> [[ZM:%.*]])
+// CHECK-NEXT: ret void
+//
+// CPP-CHECK-LABEL: @_Z29test_svmop4s_1x1_za64_u16_s16u12__SVUint16_tu11__SVInt16_t(
+// CPP-CHECK-NEXT: entry:
+// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.mop4s.wide.1x1.nxv8i16(i32 3, <vscale x 8 x i16> [[ZN:%.*]], <vscale x 8 x i16> [[ZM:%.*]])
+// CPP-CHECK-NEXT: ret void
+//
+void test_svmop4s_1x1_za64_u16_s16(svuint16_t zn, svint16_t zm) __arm_streaming __arm_inout("za") {
+ SME_ACLE_FUNC(svmop4s_1x1_za64,_u16_s16,)(3, zn, zm);
+}
+
+// CHECK-LABEL: @test_svmop4a_1x1_za32_s8_u8(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: tail call void @llvm.aarch64.sme.mop4a.wide.1x1.nxv16i8(i32 3, <vscale x 16 x i8> [[ZN:%.*]], <vscale x 16 x i8> [[ZM:%.*]])
+// CHECK-NEXT: ret void
+//
+// CPP-CHECK-LABEL: @_Z27test_svmop4a_1x1_za32_s8_u8u10__SVInt8_tu11__SVUint8_t(
+// CPP-CHECK-NEXT: entry:
+// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.mop4a.wide.1x1.nxv16i8(i32 3, <vscale x 16 x i8> [[ZN:%.*]], <vscale x 16 x i8> [[ZM:%.*]])
+// CPP-CHECK-NEXT: ret void
+//
+void test_svmop4a_1x1_za32_s8_u8(svint8_t zn, svuint8_t zm) __arm_streaming __arm_inout("za") {
+ SME_ACLE_FUNC(svmop4a_1x1_za32,_s8_u8,)(3, zn, zm);
+}
+
+// CHECK-LABEL: @test_svmop4s_1x1_za32_s8_u8(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: tail call void @llvm.aarch64.sme.mop4s.wide.1x1.nxv16i8(i32 3, <vscale x 16 x i8> [[ZN:%.*]], <vscale x 16 x i8> [[ZM:%.*]])
+// CHECK-NEXT: ret void
+//
+// CPP-CHECK-LABEL: @_Z27test_svmop4s_1x1_za32_s8_u8u10__SVInt8_tu11__SVUint8_t(
+// CPP-CHECK-NEXT: entry:
+// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.mop4s.wide.1x1.nxv16i8(i32 3, <vscale x 16 x i8> [[ZN:%.*]], <vscale x 16 x i8> [[ZM:%.*]])
+// CPP-CHECK-NEXT: ret void
+//
+void test_svmop4s_1x1_za32_s8_u8(svint8_t zn, svuint8_t zm) __arm_streaming __arm_inout("za") {
+ SME_ACLE_FUNC(svmop4s_1x1_za32,_s8_u8,)(3, zn, zm);
+}
+
+// CHECK-LABEL: @test_svmop4a_1x1_za32_u8_s8(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: tail call void @llvm.aarch64.sme.mop4a.wide.1x1.nxv16i8(i32 3, <vscale x 16 x i8> [[ZN:%.*]], <vscale x 16 x i8> [[ZM:%.*]])
+// CHECK-NEXT: ret void
+//
+// CPP-CHECK-LABEL: @_Z27test_svmop4a_1x1_za32_u8_s8u11__SVUint8_tu10__SVInt8_t(
+// CPP-CHECK-NEXT: entry:
+// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.mop4a.wide.1x1.nxv16i8(i32 3, <vscale x 16 x i8> [[ZN:%.*]], <vscale x 16 x i8> [[ZM:%.*]])
+// CPP-CHECK-NEXT: ret void
+//
+void test_svmop4a_1x1_za32_u8_s8(svuint8_t zn, svint8_t zm) __arm_streaming __arm_inout("za") {
+ SME_ACLE_FUNC(svmop4a_1x1_za32,_u8_s8,)(3, zn, zm);
+}
+
+// CHECK-LABEL: @test_svmop4s_1x1_za32_u8_s8(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: tail call void @llvm.aarch64.sme.mop4s.wide.1x1.nxv16i8(i32 3, <vscale x 16 x i8> [[ZN:%.*]], <vscale x 16 x i8> [[ZM:%.*]])
+// CHECK-NEXT: ret void
+//
+// CPP-CHECK-LABEL: @_Z27test_svmop4s_1x1_za32_u8_s8u11__SVUint8_tu10__SVInt8_t(
+// CPP-CHECK-NEXT: entry:
+// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.mop4s.wide.1x1.nxv16i8(i32 3, <vscale x 16 x i8> [[ZN:%.*]], <vscale x 16 x i8> [[ZM:%.*]])
+// CPP-CHECK-NEXT: ret void
+//
+void test_svmop4s_1x1_za32_u8_s8(svuint8_t zn, svint8_t zm) __arm_streaming __arm_inout("za") {
+ SME_ACLE_FUNC(svmop4s_1x1_za32,_u8_s8,)(3, zn, zm);
+}
+
+// CHECK-LABEL: @test_svmop4a_1x1_za16_f16_f16(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: tail call void @llvm.aarch64.sme.mop4a.1x1.nxv8f16(i32 3, <vscale x 8 x half> [[ZN:%.*]], <vscale x 8 x half> [[ZM:%.*]])
+// CHECK-NEXT: ret void
+//
+// CPP-CHECK-LABEL: @_Z29test_svmop4a_1x1_za16_f16_f16u13__SVFloat16_tS_(
+// CPP-CHECK-NEXT: entry:
+// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.mop4a.1x1.nxv8f16(i32 3, <vscale x 8 x half> [[ZN:%.*]], <vscale x 8 x half> [[ZM:%.*]])
+// CPP-CHECK-NEXT: ret void
+//
+void test_svmop4a_1x1_za16_f16_f16(svfloat16_t zn, svfloat16_t zm) __arm_streaming __arm_inout("za") {
+ SME_ACLE_FUNC(svmop4a_1x1_za16,_f16_f16,)(3, zn, zm);
+}
+
+// CHECK-LABEL: @test_svmop4s_1x1_za16_f16_f16(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: tail call void @llvm.aarch64.sme.mop4s.1x1.nxv8f16(i32 3, <vscale x 8 x half> [[ZN:%.*]], <vscale x 8 x half> [[ZM:%.*]])
+// CHECK-NEXT: ret void
+//
+// CPP-CHECK-LABEL: @_Z29test_svmop4s_1x1_za16_f16_f16u13__SVFloat16_tS_(
+// CPP-CHECK-NEXT: entry:
+// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.mop4s.1x1.nxv8f16(i32 3, <vscale x 8 x half> [[ZN:%.*]], <vscale x 8 x half> [[ZM:%.*]])
+// CPP-CHECK-NEXT: ret void
+//
+void test_svmop4s_1x1_za16_f16_f16(svfloat16_t zn, svfloat16_t zm) __arm_streaming __arm_inout("za") {
+ SME_ACLE_FUNC(svmop4s_1x1_za16,_f16_f16,)(3, zn, zm);
+}
+
+// CHECK-LABEL: @test_svmop4a_1x1_za32_f32_f32(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: tail call void @llvm.aarch64.sme.mop4a.1x1.nxv4f32(i32 3, <vscale x 4 x float> [[ZN:%.*]], <vscale x 4 x float> [[ZM:%.*]])
+// CHECK-NEXT: ret void
+//
+// CPP-CHECK-LABEL: @_Z29test_svmop4a_1x1_za32_f32_f32u13__SVFloat32_tS_(
+// CPP-CHECK-NEXT: entry:
+// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.mop4a.1x1.nxv4f32(i32 3, <vscale x 4 x float> [[ZN:%.*]], <vscale x 4 x float> [[ZM:%.*]])
+// CPP-CHECK-NEXT: ret void
+//
+void test_svmop4a_1x1_za32_f32_f32(svfloat32_t zn, svfloat32_t zm) __arm_streaming __arm_inout("za") {
+ SME_ACLE_FUNC(svmop4a_1x1_za32,_f32_f32,)(3, zn, zm);
+}
+
+// CHECK-LABEL: @test_svmop4s_1x1_za32_f32_f32(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: tail call void @llvm.aarch64.sme.mop4s.1x1.nxv4f32(i32 3, <vscale x 4 x float> [[ZN:%.*]], <vscale x 4 x float> [[ZM:%.*]])
+// CHECK-NEXT: ret void
+//
+// CPP-CHECK-LABEL: @_Z29test_svmop4s_1x1_za32_f32_f32u13__SVFloat32_tS_(
+// CPP-CHECK-NEXT: entry:
+// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.mop4s.1x1.nxv4f32(i32 3, <vscale x 4 x float> [[ZN:%.*]], <vscale x 4 x float> [[ZM:%.*]])
+// CPP-CHECK-NEXT: ret void
+//
+void test_svmop4s_1x1_za32_f32_f32(svfloat32_t zn, svfloat32_t zm) __arm_streaming __arm_inout("za") {
+ SME_ACLE_FUNC(svmop4s_1x1_za32,_f32_f32,)(3, zn, zm);
+}
+
+// CHECK-LABEL: @test_svmop4a_1x1_za64_f64_f64(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: tail call void @llvm.aarch64.sme.mop4a.1x1.nxv2f64(i32 3, <vscale x 2 x double> [[ZN:%.*]], <vscale x 2 x double> [[ZM:%.*]])
+// CHECK-NEXT: ret void
+//
+// CPP-CHECK-LABEL: @_Z29test_svmop4a_1x1_za64_f64_f64u13__SVFloat64_tS_(
+// CPP-CHECK-NEXT: entry:
+// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.mop4a.1x1.nxv2f64(i32 3, <vscale x 2 x double> [[ZN:%.*]], <vscale x 2 x double> [[ZM:%.*]])
+// CPP-CHECK-NEXT: ret void
+//
+void test_svmop4a_1x1_za64_f64_f64(svfloat64_t zn, svfloat64_t zm) __arm_streaming __arm_inout("za") {
+ SME_ACLE_FUNC(svmop4a_1x1_za64,_f64_f64,)(3, zn, zm);
+}
+
+// CHECK-LABEL: @test_svmop4s_1x1_za64_f64_f64(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: tail call void @llvm.aarch64.sme.mop4s.1x1.nxv2f64(i32 3, <vscale x 2 x double> [[ZN:%.*]], <vscale x 2 x double> [[ZM:%.*]])
+// CHECK-NEXT: ret void
+//
+// CPP-CHECK-LABEL: @_Z29test_svmop4s_1x1_za64_f64_f64u13__SVFloat64_tS_(
+// CPP-CHECK-NEXT: entry:
+// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.mop4s.1x1.nxv2f64(i32 3, <vscale x 2 x double> [[ZN:%.*]], <vscale x 2 x double> [[ZM:%.*]])
+// CPP-CHECK-NEXT: ret void
+//
+void test_svmop4s_1x1_za64_f64_f64(svfloat64_t zn, svfloat64_t zm) __arm_streaming __arm_inout("za") {
+ SME_ACLE_FUNC(svmop4s_1x1_za64,_f64_f64,)(3, zn, zm);
+}
+
+// CHECK-LABEL: @test_svmop4a_1x1_za16_bf16_bf16(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: tail call void @llvm.aarch64.sme.mop4a.1x1.nxv8bf16(i32 3, <vscale x 8 x bfloat> [[ZN:%.*]], <vscale x 8 x bfloat> [[ZM:%.*]])
+// CHECK-NEXT: ret void
+//
+// CPP-CHECK-LABEL: @_Z31test_svmop4a_1x1_za16_bf16_bf16u14__SVBfloat16_tS_(
+// CPP-CHECK-NEXT: entry:
+// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.mop4a.1x1.nxv8bf16(i32 3, <vscale x 8 x bfloat> [[ZN:%.*]], <vscale x 8 x bfloat> [[ZM:%.*]])
+// CPP-CHECK-NEXT: ret void
+//
+void test_svmop4a_1x1_za16_bf16_bf16(svbfloat16_t zn, svbfloat16_t zm) __arm_streaming __arm_inout("za") {
+ SME_ACLE_FUNC(svmop4a_1x1_za16,_bf16_bf16,)(3, zn, zm);
+}
+
+// CHECK-LABEL: @test_svmop4s_1x1_za16_bf16_bf16(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: tail call void @llvm.aarch64.sme.mop4s.1x1.nxv8bf16(i32 3, <vscale x 8 x bfloat> [[ZN:%.*]], <vscale x 8 x bfloat> [[ZM:%.*]])
+// CHECK-NEXT: ret void
+//
+// CPP-CHECK-LABEL: @_Z31test_svmop4s_1x1_za16_bf16_bf16u14__SVBfloat16_tS_(
+// CPP-CHECK-NEXT: entry:
+// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.mop4s.1x1.nxv8bf16(i32 3, <vscale x 8 x bfloat> [[ZN:%.*]], <vscale x 8 x bfloat> [[ZM:%.*]])
+// CPP-CHECK-NEXT: ret void
+//
+void test_svmop4s_1x1_za16_bf16_bf16(svbfloat16_t zn, svbfloat16_t zm) __arm_streaming __arm_inout("za") {
+ SME_ACLE_FUNC(svmop4s_1x1_za16,_bf16_bf16,)(3, zn, zm);
+}
diff --git a/llvm/include/llvm/IR/IntrinsicsAArch64.td b/llvm/include/llvm/IR/IntrinsicsAArch64.td
index 6dfc3c8f2a393..0714602a2f09b 100644
--- a/llvm/include/llvm/IR/IntrinsicsAArch64.td
+++ b/llvm/include/llvm/IR/IntrinsicsAArch64.td
@@ -1497,7 +1497,7 @@ let TargetPrefix = "aarch64" in { // All intrinsics start with "llvm.aarch64.".
LLVMSubdivide2VectorType<0>,
llvm_i32_ty],
[IntrNoMem, ImmArg<ArgIndex<3>>]>;
-
+
class SVE2_1VectorArgIndexed_Intrinsic
: DefaultAttrsIntrinsic<[llvm_anyvector_ty],
[LLVMMatchType<0>,
@@ -1512,7 +1512,7 @@ let TargetPrefix = "aarch64" in { // All intrinsics start with "llvm.aarch64.".
llvm_i32_ty,
llvm_i32_ty],
[IntrNoMem, ImmArg<ArgIndex<3>>, ImmArg<ArgIndex<4>>]>;
-
+
class SVE2_1VectorArg_Pred_Intrinsic
: DefaultAttrsIntrinsic<[LLVMScalarOrSameVectorWidth<0, llvm_i1_ty>],
[llvm_anyvector_ty],
@@ -1522,7 +1522,7 @@ let TargetPrefix = "aarch64" in { // All intrinsics start with "llvm.aarch64.".
: DefaultAttrsIntrinsic<[LLVMScalarOrSameVectorWidth<0, llvm_i1_ty>],
[llvm_anyvector_ty, llvm_i32_ty],
[IntrNoMem, ImmArg<ArgIndex<1>>]>;
-
+
class SVE2_Pred_1VectorArgIndexed_Intrinsic
: DefaultAttrsIntrinsic<[llvm_anyvector_ty],
[LLVMMatchType<0>,
@@ -3064,6 +3064,17 @@ let TargetPrefix = "aarch64" in {
def int_aarch64_sme_usmopa_wide : SME_OuterProduct_Intrinsic;
def int_aarch64_sme_usmops_wide : SME_OuterProduct_Intrinsic;
+ class SME_OuterProduct_QuarterTile
+ : DefaultAttrsIntrinsic<[],
+ [llvm_i32_ty,
+ llvm_anyvector_ty,
+ LLVMMatchType<0>], [ImmArg<ArgIndex<0>>]>;
+
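+ // Quarter-tile outer product: an immediate ZA tile index and two vectors of
+ // the same (overloaded) element type; the "_wide" variants accumulate into a
+ // ZA tile with elements wider than the sources.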
+ def int_aarch64_sme_mop4a_wide_1x1 : SME_OuterProduct_QuarterTile;
+ def int_aarch64_sme_mop4s_wide_1x1 : SME_OuterProduct_QuarterTile;
+ def int_aarch64_sme_mop4a_1x1 : SME_OuterProduct_QuarterTile;
+ def int_aarch64_sme_mop4s_1x1 : SME_OuterProduct_QuarterTile;
+
class SME_AddVectorToTile_Intrinsic
: DefaultAttrsIntrinsic<[],
[llvm_i32_ty,
@@ -3319,11 +3330,11 @@ let TargetPrefix = "aarch64" in {
: DefaultAttrsIntrinsic<[llvm_nxv8bf16_ty],
[llvm_nxv4f32_ty, llvm_nxv4f32_ty],
[IntrNoMem]>;
-
+
class SME2_CVT_WIDENING_VG2_Intrinsic
: DefaultAttrsIntrinsic<[llvm_anyvector_ty, LLVMMatchType<0>],
[LLVMSubdivide2VectorType<0>], [IntrNoMem]>;
-
+
class SME2_CVT_VG4_SINGLE_Intrinsic
: DefaultAttrsIntrinsic<[LLVMSubdivide4VectorType<0>],
@@ -3564,7 +3575,7 @@ let TargetPrefix = "aarch64" in {
foreach vg = ["vg1x2", "vg1x4", "vg2x1", "vg2x2", "vg2x4", "vg4x1", "vg4x2", "vg4x4"] in {
def int_aarch64_sme_zero_za64_ # vg : DefaultAttrsIntrinsic<[], [llvm_i32_ty], [IntrNoMem, IntrHasSideEffects]>;
}
-
+
// Multi-vector signed saturating doubling multiply high
def int_aarch64_sve_sqdmulh_single_vgx2 : SME2_VG2_Multi_Single_Intrinsic;
@@ -3634,7 +3645,7 @@ let TargetPrefix = "aarch64" in {
//
//Multi-vector floating-point convert from half-precision to deinterleaved single-precision.
//
-
+
def int_aarch64_sve_fcvtl_widen_x2 : SME2_CVT_WIDENING_VG2_Intrinsic;
//
@@ -3826,7 +3837,7 @@ let TargetPrefix = "aarch64" in {
def int_aarch64_sme_luti4_lane_zt
: DefaultAttrsIntrinsic<[llvm_anyvector_ty], [llvm_i32_ty, llvm_nxv16i8_ty, llvm_i32_ty],
[ImmArg<ArgIndex<0>>, ImmArg<ArgIndex<2>>, IntrReadMem]>;
-
+
// Lookup table expand two registers
//
def int_aarch64_sme_luti2_lane_zt_x2
@@ -3835,7 +3846,7 @@ let TargetPrefix = "aarch64" in {
def int_aarch64_sme_luti4_lane_zt_x2
: DefaultAttrsIntrinsic<[llvm_anyvector_ty, LLVMMatchType<0>], [llvm_i32_ty, llvm_nxv16i8_ty, llvm_i32_ty],
[ImmArg<ArgIndex<0>>, ImmArg<ArgIndex<2>>, IntrReadMem]>;
-
+
//
// Lookup table expand four registers
//
@@ -3853,7 +3864,7 @@ let TargetPrefix = "aarch64" in {
[llvm_i32_ty, llvm_nxv16i8_ty, llvm_nxv16i8_ty],
[ImmArg<ArgIndex<0>>, IntrNoMem, IntrHasSideEffects]>;
-
+
//
// Register scaling
//
@@ -3901,7 +3912,7 @@ def int_aarch64_sve_extq : AdvSIMD_2VectorArgIndexed_Intrinsic;
//
// SVE2.1 - Move predicate to/from vector
//
-def int_aarch64_sve_pmov_to_pred_lane : SVE2_1VectorArgIndexed_Pred_Intrinsic;
+def int_aarch64_sve_pmov_to_pred_lane : SVE2_1VectorArgIndexed_Pred_Intrinsic;
def int_aarch64_sve_pmov_to_pred_lane_zero : SVE2_1VectorArg_Pred_Intrinsic;
@@ -3943,10 +3954,10 @@ let TargetPrefix = "aarch64" in {
: DefaultAttrsIntrinsic<[llvm_nxv16i8_ty],
[llvm_anyvector_ty, LLVMMatchType<0>],
[IntrReadMem, IntrInaccessibleMemOnly]>;
-
+
def int_aarch64_sve_fp8_cvtn : SVE2_FP8_Narrow_Cvt;
def int_aarch64_sve_fp8_cvtnb : SVE2_FP8_Narrow_Cvt;
-
+
def int_aarch64_sve_fp8_cvtnt
: DefaultAttrsIntrinsic<[llvm_nxv16i8_ty],
[llvm_nxv16i8_ty, llvm_anyvector_ty, LLVMMatchType<0>],
@@ -3958,32 +3969,32 @@ let TargetPrefix = "aarch64" in {
[LLVMMatchType<0>,
llvm_nxv16i8_ty, llvm_nxv16i8_ty],
[IntrReadMem, IntrInaccessibleMemOnly]>;
-
+
class SVE2_FP8_FMLA_FDOT_Lane
: DefaultAttrsIntrinsic<[llvm_anyvector_ty],
[LLVMMatchType<0>,
llvm_nxv16i8_ty, llvm_nxv16i8_ty, llvm_i32_ty],
[IntrReadMem, IntrInaccessibleMemOnly, ImmArg<ArgIndex<3>>]>;
-
+
def int_aarch64_sve_fp8_fdot : SVE2_FP8_FMLA_FDOT;
def int_aarch64_sve_fp8_fdot_lane : SVE2_FP8_FMLA_FDOT_Lane;
// Fused multiply-add
def int_aarch64_sve_fp8_fmlalb : SVE2_FP8_FMLA_FDOT;
def int_aarch64_sve_fp8_fmlalb_lane : SVE2_FP8_FMLA_FDOT_Lane;
-
+
def int_aarch64_sve_fp8_fmlalt : SVE2_FP8_FMLA_FDOT;
def int_aarch64_sve_fp8_fmlalt_lane : SVE2_FP8_FMLA_FDOT_Lane;
-
+
def int_aarch64_sve_fp8_fmlallbb : SVE2_FP8_FMLA_FDOT;
def int_aarch64_sve_fp8_fmlallbb_lane : SVE2_FP8_FMLA_FDOT_Lane;
-
+
def int_aarch64_sve_fp8_fmlallbt : SVE2_FP8_FMLA_FDOT;
def int_aarch64_sve_fp8_fmlallbt_lane : SVE2_FP8_FMLA_FDOT_Lane;
-
+
def int_aarch64_sve_fp8_fmlalltb : SVE2_FP8_FMLA_FDOT;
def int_aarch64_sve_fp8_fmlalltb_lane : SVE2_FP8_FMLA_FDOT_Lane;
-
+
def int_aarch64_sve_fp8_fmlalltt : SVE2_FP8_FMLA_FDOT;
def int_aarch64_sve_fp8_fmlalltt_lane : SVE2_FP8_FMLA_FDOT_Lane;
diff --git a/llvm/lib/Target/AArch64/AArch64SMEInstrInfo.td b/llvm/lib/Target/AArch64/AArch64SMEInstrInfo.td
index d2aa86f388db2..0673394d4daa9 100644
--- a/llvm/lib/Target/AArch64/AArch64SMEInstrInfo.td
+++ b/llvm/lib/Target/AArch64/AArch64SMEInstrInfo.td
@@ -148,30 +148,30 @@ defm USMOPS_MPPZZ_D : sme_int_outer_product_i64<0b101, "usmops", int_aarch64_sme
}
let Predicates = [HasSME_MOP4] in {
- defm SMOP4A : sme_quarter_outer_product_i8_i32<0b0, 0b0, 0b0, "smop4a">;
- defm SMOP4S : sme_quarter_outer_product_i8_i32<0b0, 0b0, 0b1, "smop4s">;
- defm SUMOP4A : sme_quarter_outer_product_i8_i32<0b0, 0b1, 0b0, "sumop4a">;
- defm SUMOP4S : sme_quarter_outer_product_i8_i32<0b0, 0b1, 0b1, "sumop4s">;
- defm USMOP4A : sme_quarter_outer_product_i8_i32<0b1, 0b0, 0b0, "usmop4a">;
- defm USMOP4S : sme_quarter_outer_product_i8_i32<0b1, 0b0, 0b1, "usmop4s">;
- defm UMOP4A : sme_quarter_outer_product_i8_i32<0b1, 0b1, 0b0, "umop4a">;
- defm UMOP4S : sme_quarter_outer_product_i8_i32<0b1, 0b1, 0b1, "umop4s">;
-
- defm SMOP4A : sme_quarter_outer_product_i16_i32<0b0, 0b0, "smop4a">;
- defm SMOP4S : sme_quarter_outer_product_i16_i32<0b0, 0b1, "smop4s">;
- defm UMOP4A : sme_quarter_outer_product_i16_i32<0b1, 0b0, "umop4a">;
- defm UMOP4S : sme_quarter_outer_product_i16_i32<0b1, 0b1, "umop4s">;
+ defm SMOP4A : sme_quarter_outer_product_i8_i32<0b0, 0b0, 0b0, "smop4a", int_aarch64_sme_mop4a_wide_1x1>;
+ defm SMOP4S : sme_quarter_outer_product_i8_i32<0b0, 0b0, 0b1, "smop4s", int_aarch64_sme_mop4s_wide_1x1>;
+ defm SUMOP4A : sme_quarter_outer_product_i8_i32<0b0, 0b1, 0b0, "sumop4a", int_aarch64_sme_mop4a_wide_1x1>;
+ defm SUMOP4S : sme_quarter_outer_product_i8_i32<0b0, 0b1, 0b1, "sumop4s", int_aarch64_sme_mop4s_wide_1x1>;
+ defm USMOP4A : sme_quarter_outer_product_i8_i32<0b1, 0b0, 0b0, "usmop4a", int_aarch64_sme_mop4a_wide_1x1>;
+ defm USMOP4S : sme_quarter_outer_product_i8_i32<0b1, 0b0, 0b1, "usmop4s", int_aarch64_sme_mop4s_wide_1x1>;
+ defm UMOP4A : sme_quarter_outer_product_i8_i32<0b1, 0b1, 0b0, "umop4a", int_aarch64_sme_mop4a_wide_1x1>;
+ defm UMOP4S : sme_quarter_outer_product_i8_i32<0b1, 0b1, 0b1, "umop4s", int_aarch64_sme_mop4s_wide_1x1>;
+
+ defm SMOP4A : sme_quarter_outer_product_i16_i32<0b0, 0b0, "smop4a", int_aarch64_sme_mop4a_wide_1x1>;
+ defm SMOP4S : sme_quarter_outer_product_i16_i32<0b0, 0b1, "smop4s", int_aarch64_sme_mop4s_wide_1x1>;
+ defm UMOP4A : sme_quarter_outer_product_i16_i32<0b1, 0b0, "umop4a", int_aarch64_sme_mop4a_wide_1x1>;
+ defm UMOP4S : sme_quarter_outer_product_i16_i32<0b1, 0b1, "umop4s", int_aarch64_sme_mop4s_wide_1x1>;
}
let Predicates = [HasSME_MOP4, HasSMEI16I64] in {
- defm SMOP4A : sme_quarter_outer_product_i64<0b0, 0b0, 0b0, "smop4a">;
- defm SMOP4S : sme_quarter_outer_product_i64<0b0, 0b0, 0b1, "smop4s">;
- defm SUMOP4A : sme_quarter_outer_product_i64<0b0, 0b1, 0b0, "sumop4a">;
- defm SUMOP4S : sme_quarter_outer_product_i64<0b0, 0b1, 0b1, "sumop4s">;
- defm UMOP4A : sme_quarter_outer_product_i64<0b1, 0b1, 0b0, "umop4a">;
- defm UMOP4S : sme_quarter_outer_product_i64<0b1, 0b1, 0b1, "umop4s">;
- defm USMOP4A : sme_quarter_outer_product_i64<0b1, 0b0, 0b0, "usmop4a">;
- defm USMOP4S : sme_quarter_outer_product_i64<0b1, 0b0, 0b1, "usmop4s">;
+ defm SMOP4A : sme_quarter_outer_product_i64<0b0, 0b0, 0b0, "smop4a", int_aarch64_sme_mop4a_wide_1x1>;
+ defm SMOP4S : sme_quarter_outer_product_i64<0b0, 0b0, 0b1, "smop4s", int_aarch64_sme_mop4s_wide_1x1>;
+ defm SUMOP4A : sme_quarter_outer_product_i64<0b0, 0b1, 0b0, "sumop4a", int_aarch64_sme_mop4a_wide_1x1>;
+ defm SUMOP4S : sme_quarter_outer_product_i64<0b0, 0b1, 0b1, "sumop4s", int_aarch64_sme_mop4s_wide_1x1>;
+ defm UMOP4A : sme_quarter_outer_product_i64<0b1, 0b1, 0b0, "umop4a", int_aarch64_sme_mop4a_wide_1x1>;
+ defm UMOP4S : sme_quarter_outer_product_i64<0b1, 0b1, 0b1, "umop4s", int_aarch64_sme_mop4s_wide_1x1>;
+ defm USMOP4A : sme_quarter_outer_product_i64<0b1, 0b0, 0b0, "usmop4a", int_aarch64_sme_mop4a_wide_1x1>;
+ defm USMOP4S : sme_quarter_outer_product_i64<0b1, 0b0, 0b1, "usmop4s", int_aarch64_sme_mop4s_wide_1x1>;
}
let Predicates = [HasSME_TMOP] in {
@@ -1054,14 +1054,14 @@ let Predicates = [HasSME2, HasSVEBFSCALE] in {
}
let Predicates = [HasSME_MOP4] in {
- defm BFMOP4A : sme2_bfmop4as_widening<0, "bfmop4a">;
- defm BFMOP4S : sme2_bfmop4as_widening<1, "bfmop4s">;
+ defm BFMOP4A : sme2_bfmop4as_widening<0, "bfmop4a", int_aarch64_sme_mop4a_wide_1x1>;
+ defm BFMOP4S : sme2_bfmop4as_widening<1, "bfmop4s", int_aarch64_sme_mop4s_wide_1x1>;
- defm FMOP4A : sme2_fmop4as_fp16_fp32_widening<0, "fmop4a">;
- defm FMOP4S : sme2_fmop4as_fp16_fp32_widening<1, "fmop4s">;
+ defm FMOP4A : sme2_fmop4as_fp16_fp32_widening<0, "fmop4a", int_aarch64_sme_mop4a_wide_1x1>;
+ defm FMOP4S : sme2_fmop4as_fp16_fp32_widening<1, "fmop4s", int_aarch64_sme_mop4s_wide_1x1>;
- defm FMOP4A : sme2_fmop4as_fp32_non_widening<0, "fmop4a">;
- defm FMOP4S : sme2_fmop4as_fp32_non_widening<1, "fmop4s">;
+ defm FMOP4A : sme2_fmop4as_fp32_non_widening<0, "fmop4a", int_aarch64_sme_mop4a_1x1>;
+ defm FMOP4S : sme2_fmop4as_fp32_non_widening<1, "fmop4s", int_aarch64_sme_mop4s_1x1>;
}
let Predicates = [HasSME_TMOP] in {
@@ -1084,7 +1084,7 @@ let Predicates = [HasSME_TMOP, HasSMEB16B16] in {
let Predicates = [HasSME_TMOP, HasSMEF8F32], Uses = [FPMR, FPCR] in {
def FTMOPA_M2ZZZI_BtoS : sme_tmopa_32b<0b01000, ZZ_b_mul_r, ZPR8, "ftmopa">;
-}
+}
let Predicates = [HasSME_TMOP, HasSMEF8F16], Uses = [FPMR, FPCR] in {
def FTMOPA_M2ZZZI_BtoH : sme_tmopa_16b<0b01001, ZZ_b_mul_r, ZPR8, "ftmopa">;
@@ -1099,8 +1099,8 @@ let Predicates = [HasSME_TMOP, HasSMEF16F16] in {
}
let Predicates = [HasSME_MOP4, HasSMEF16F16] in {
- defm FMOP4A : sme2_fmop4as_fp16_non_widening<0, "fmop4a">;
- defm FMOP4S : sme2_fmop4as_fp16_non_widening<1, "fmop4s">;
+ defm FMOP4A : sme2_fmop4as_fp16_non_widening<0, "fmop4a", int_aarch64_sme_mop4a_1x1>;
+ defm FMOP4S : sme2_fmop4as_fp16_non_widening<1, "fmop4s", int_aarch64_sme_mop4s_1x1>;
}
let Predicates = [HasSME2, HasSVEBFSCALE] in {
@@ -1115,11 +1115,11 @@ let Predicates = [HasSME_MOP4, HasSMEF8F32] in {
}
let Predicates = [HasSME_MOP4, HasSMEB16B16] in {
- defm BFMOP4A : sme2_bfmop4as_non_widening<0, "bfmop4a">;
- defm BFMOP4S : sme2_bfmop4as_non_widening<1, "bfmop4s">;
+ defm BFMOP4A : sme2_bfmop4as_non_widening<0, "bfmop4a", int_aarch64_sme_mop4a_1x1>;
+ defm BFMOP4S : sme2_bfmop4as_non_widening<1, "bfmop4s", int_aarch64_sme_mop4s_1x1>;
}
let Predicates = [HasSME_MOP4, HasSMEF64F64] in {
- defm FMOP4A : sme2_fmop4as_fp64_non_widening<0, "fmop4a">;
- defm FMOP4S : sme2_fmop4as_fp64_non_widening<1, "fmop4s">;
+ defm FMOP4A : sme2_fmop4as_fp64_non_widening<0, "fmop4a", int_aarch64_sme_mop4a_1x1>;
+ defm FMOP4S : sme2_fmop4as_fp64_non_widening<1, "fmop4s", int_aarch64_sme_mop4s_1x1>;
}
diff --git a/llvm/lib/Target/AArch64/SMEInstrFormats.td b/llvm/lib/Target/AArch64/SMEInstrFormats.td
index 4f6a413ba5e5c..5a3d12e9f7b8b 100644
--- a/llvm/lib/Target/AArch64/SMEInstrFormats.td
+++ b/llvm/lib/Target/AArch64/SMEInstrFormats.td
@@ -104,6 +104,15 @@ class sme_outer_product_pseudo<ZPRRegOp zpr_ty, SMEMatrixTypeEnum za_flag>
let usesCustomInserter = 1;
}
+class sme2_quarter_tile_outer_product_pseudo<ZPRRegOp zn_ty, ZPRRegOp zm_ty, SMEMatrixTypeEnum za_flag>
+ : Pseudo<(outs), (ins i32imm:$tile,
+ zn_ty:$zn, zm_ty:$zm), []>,
+ Sched<[]> {
+ // Translated to the actual instructions in AArch64ISelLowering.cpp
+ let SMEMatrixType = za_flag;
+ let usesCustomInserter = 1;
+}
+
class sme2_za_array_2op_multi_single_pseudo<string name, Operand index_ty, RegisterOperand multi_vector_ty,
ZPRRegOp zpr_ty, SMEMatrixTypeEnum za_flag>
: SMEPseudo2Instr<name, 0>,
@@ -251,12 +260,15 @@ class SME2_Tile_VG4_Multi_Pat<string name, SDPatternOperator intrinsic, Operand
class SME2_Zero_Matrix_Pat<string name, SDPatternOperator intrinsic, Operand offset_ty, ComplexPattern tileslice>
: Pat<(intrinsic (i32 (tileslice MatrixIndexGPR32Op8_11:$base, offset_ty:$offset))),
- (!cast<Instruction>(name) $base, $offset)>;
+ (!cast<Instruction>(name) $base, $offset)>;
class SME2_Tile_Movaz_Pat<string name, SDPatternOperator intrinsic, ValueType out_vt, Operand tile_imm, Operand index_ty, ComplexPattern tileslice>
: Pat<(out_vt (intrinsic tile_imm:$tile, (i32 (tileslice MatrixIndexGPR32Op12_15:$base, index_ty:$offset)))),
(!cast<Instruction>(name # _PSEUDO) $tile, $base, $offset)>;
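+// Matches a (tile, zn, zm) quarter-tile intrinsic and selects the matching
+// _PSEUDO, which is expanded to the real instruction in
+// AArch64ISelLowering.cpp.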
+class SME2_ZA_Tile_TwoVec_Pat<string name, SDPatternOperator intrinsic, Operand imm_ty, ValueType vt>
+ : Pat<(intrinsic imm_ty:$tile, vt:$Zn, vt:$Zm),
+ (!cast<Instruction>(name # _PSEUDO) $tile, $Zn, $Zm)>;
//===----------------------------------------------------------------------===//
// SME pattern match helpers.
@@ -600,9 +612,14 @@ class sme_quarter_outer_product_i16_i32<bit u0, bit N, bit M, bit subtr, Registe
let Constraints = "$ZAda = $_ZAda";
}
-multiclass sme_quarter_outer_product_i8_i32<bit zn_u, bit zm_u, bit subtr, string mnemonic>{
+multiclass sme_quarter_outer_product_i8_i32<bit zn_u, bit zm_u, bit subtr, string mnemonic, SDPatternOperator op>{
def _MZZ_BToS : sme_quarter_outer_product_i8_i32<{zn_u, 0}, {zm_u, 0}, subtr,
- ZPR8Mul2_Lo, ZPR8Mul2_Hi, mnemonic>;
+ ZPR8Mul2_Lo, ZPR8Mul2_Hi, mnemonic>, SMEPseudo2Instr<NAME # _MZZ_BToS, 1>;
+
+ def NAME # _MZZ_BToS # _PSEUDO : sme2_quarter_tile_outer_product_pseudo<ZPR8Mul2_Lo, ZPR8Mul2_Hi, SMEMatrixTileS>, SMEPseudo2Instr<NAME # _MZZ_BToS, 0>;
+
+ def : SME2_ZA_Tile_TwoVec_Pat<NAME # _MZZ_BToS, op, timm32_0_3, nxv16i8>;
+
def _M2ZZ_BToS : sme_quarter_outer_product_i8_i32<{zn_u, 1}, {zm_u, 0}, subtr,
ZZ_b_mul_r_Lo, ZPR8Mul2_Hi, mnemonic>;
def _MZ2Z_BToS : sme_quarter_outer_product_i8_i32<{zn_u, 0}, {zm_u, 1}, subtr,
@@ -611,9 +628,14 @@ multiclass sme_quarter_outer_product_i8_i32<bit zn_u, bit zm_u, bit subtr, strin
ZZ_b_mul_r_Lo, ZZ_b_mul_r_Hi, mnemonic>;
}
-multiclass sme_quarter_outer_product_i16_i32<bit unsigned, bit subtr, string mnemonic>{
+multiclass sme_quarter_outer_product_i16_i32<bit unsigned, bit subtr, string mnemonic, SDPatternOperator op>{
def _MZZ_HToS : sme_quarter_outer_product_i16_i32<unsigned, 0b0, 0b0, subtr,
- ZPR16Mul2_Lo, ZPR16Mul2_Hi, mnemonic>;
+ ZPR16Mul2_Lo, ZPR16Mul2_Hi, mnemonic>, SMEPseudo2Instr<NAME # _MZZ_HToS, 1>;
+
+ def NAME # _MZZ_HToS # _PSEUDO : sme2_quarter_tile_outer_product_pseudo<ZPR16Mul2_Lo, ZPR16Mul2_Hi, SMEMatrixTileS>, SMEPseudo2Instr<NAME # _MZZ_HToS, 0>;
+
+ def : SME2_ZA_Tile_TwoVec_Pat<NAME # _MZZ_HToS, op, timm32_0_3, nxv8i16>;
+
def _M2ZZ_HToS : sme_quarter_outer_product_i16_i32<unsigned, 0b1, 0b0, subtr,
ZZ_h_mul_r_Lo, ZPR16Mul2_Hi, mnemonic>;
def _MZ2Z_HToS : sme_quarter_outer_product_i16_i32<unsigned, 0b0, 0b1, subtr,
@@ -622,9 +644,14 @@ multiclass sme_quarter_outer_product_i16_i32<bit unsigned, bit subtr, string mne
ZZ_h_mul_r_Lo, ZZ_h_mul_r_Hi, mnemonic>;
}
-multiclass sme_quarter_outer_product_i64<bit zn_u, bit zm_u, bit subtr, string mnemonic>{
+multiclass sme_quarter_outer_product_i64<bit zn_u, bit zm_u, bit subtr, string mnemonic, SDPatternOperator op>{
def _MZZ_HtoD : sme_quarter_outer_product_i64<{zn_u, 0}, {zm_u, 0}, subtr,
- ZPR16Mul2_Lo, ZPR16Mul2_Hi, mnemonic>;
+ ZPR16Mul2_Lo, ZPR16Mul2_Hi, mnemonic>, SMEPseudo2Instr<NAME # _MZZ_HtoD, 1>;
+
+ def NAME # _MZZ_HtoD # _PSEUDO : sme2_quarter_tile_outer_product_pseudo<ZPR16Mul2_Lo, ZPR16Mul2_Hi, SMEMatrixTileD>, SMEPseudo2Instr<NAME # _MZZ_HtoD, 0>;
+
+ def : SME2_ZA_Tile_TwoVec_Pat<NAME # _MZZ_HtoD, op, timm32_0_3, nxv8i16>;
+
def _M2ZZ_HtoD : sme_quarter_outer_product_i64<{zn_u, 1}, {zm_u, 0}, subtr,
ZZ_h_mul_r_Lo, ZPR16Mul2_Hi, mnemonic>;
def _MZ2Z_HtoD : sme_quarter_outer_product_i64<{zn_u, 0}, {zm_u, 1}, subtr,
@@ -2231,7 +2258,7 @@ multiclass sme2_int_mla_long_array_vg2_single<string mnemonic, bits<2> op, SDPat
multiclass sme2_fp_mla_long_array_vg4_single<string mnemonic, bits<3> op, MatrixOperand matrix_ty,
RegisterOperand multi_vector_ty, ZPRRegOp vector_ty,
ValueType zpr_ty, SDPatternOperator intrinsic, list<Register> uses=[]> {
- def NAME : sme2_mla_long_array_vg24_single<0b00, 0b1, op{2-1}, op{0}, matrix_ty, multi_vector_ty,
+ def NAME : sme2_mla_long_array_vg24_single<0b00, 0b1, op{2-1}, op{0}, matrix_ty, multi_vector_ty,
vector_ty, mnemonic, "vgx4">, SMEPseudo2Instr<NAME, 1> {
let Uses = uses;
}
@@ -5304,7 +5331,7 @@ multiclass sme2p1_zero_matrix<string mnemonic> {
def : SME2_Zero_Matrix_Pat<NAME # _4Z_PSEUDO, int_aarch64_sme_zero_za64_vg4x1, uimm1s4range, tileslicerange1s4>;
def : SME2_Zero_Matrix_Pat<NAME # _VG2_4Z_PSEUDO, int_aarch64_sme_zero_za64_vg4x2, uimm0s4range, tileslicerange0s4>;
def : SME2_Zero_Matrix_Pat<NAME # _VG4_4Z_PSEUDO, int_aarch64_sme_zero_za64_vg4x4, uimm0s4range, tileslicerange0s4>;
-}
+}
//===----------------------------------------------------------------------===//
// SME2.1 lookup table expand two non-contiguous registers
@@ -5470,9 +5497,13 @@ class sme2_bf16_fp32_quarter_tile_outer_product<bit M, bit N, bit S, string mnem
let Constraints = "$ZAda = $_ZAda";
}
-multiclass sme2_bfmop4as_widening<bit S, string mnemonic> {
+multiclass sme2_bfmop4as_widening<bit S, string mnemonic, SDPatternOperator op> {
// Single vectors
- def _MZZ_S : sme2_bf16_fp32_quarter_tile_outer_product<0, 0, S, mnemonic, ZPR16Mul2_Lo, ZPR16Mul2_Hi>;
+ def _MZZ_S : sme2_bf16_fp32_quarter_tile_outer_product<0, 0, S, mnemonic, ZPR16Mul2_Lo, ZPR16Mul2_Hi>, SMEPseudo2Instr<NAME # _MZZ_S, 1>;
+
+ def NAME # _MZZ_S # _PSEUDO : sme2_quarter_tile_outer_product_pseudo<ZPR16Mul2_Lo, ZPR16Mul2_Hi, SMEMatrixTileS>, SMEPseudo2Instr<NAME # _MZZ_S, 0>;
+
+ def : SME2_ZA_Tile_TwoVec_Pat<NAME # _MZZ_S, op, timm32_0_3, nxv8bf16>;
// Multiple and single vectors
def _M2ZZ_S : sme2_bf16_fp32_quarter_tile_outer_product<0, 1, S, mnemonic, ZZ_h_mul_r_Lo, ZPR16Mul2_Hi>;
@@ -5617,9 +5648,13 @@ class sme2_fp16_quarter_tile_outer_product<bit M, bit N, bit S, string mnemonic,
let Constraints = "$ZAda = $_ZAda";
}
-multiclass sme2_fmop4as_fp16_non_widening<bit S, string mnemonic> {
+multiclass sme2_fmop4as_fp16_non_widening<bit S, string mnemonic, SDPatternOperator op> {
// Single vectors
- def _MZZ_H : sme2_fp16_quarter_tile_outer_product<0, 0, S, mnemonic, ZPR16Mul2_Lo, ZPR16Mul2_Hi>;
+ def _MZZ_H : sme2_fp16_quarter_tile_outer_product<0, 0, S, mnemonic, ZPR16Mul2_Lo, ZPR16Mul2_Hi>, SMEPseudo2Instr<NAME # _MZZ_H, 1>;
+
+ def NAME # _MZZ_H # _PSEUDO : sme2_quarter_tile_outer_product_pseudo<ZPR16Mul2_Lo, ZPR16Mul2_Hi, SMEMatrixTileH>, SMEPseudo2Instr<NAME # _MZZ_H, 0>;
+
+ def : SME2_ZA_Tile_TwoVec_Pat<NAME # _MZZ_H, op, timm32_0_3, nxv8f16>;
// Multiple and single vectors
def _M2ZZ_H : sme2_fp16_quarter_tile_outer_product<0, 1, S, mnemonic, ZZ_h_mul_r_Lo, ZPR16Mul2_Hi>;
@@ -5689,9 +5724,13 @@ class sme2_bf16_fp16_quarter_tile_outer_product<bit M, bit N, bit S, string mnem
let Constraints = "$ZAda = $_ZAda";
}
-multiclass sme2_bfmop4as_non_widening<bit S, string mnemonic> {
+multiclass sme2_bfmop4as_non_widening<bit S, string mnemonic, SDPatternOperator op> {
// Single vectors
- def _MZZ_H : sme2_bf16_fp16_quarter_tile_outer_product<0, 0, S, mnemonic, ZPR16Mul2_Lo, ZPR16Mul2_Hi>;
+ def _MZZ_H : sme2_bf16_fp16_quarter_tile_outer_product<0, 0, S, mnemonic, ZPR16Mul2_Lo, ZPR16Mul2_Hi>, SMEPseudo2Instr<NAME # _MZZ_H, 1>;
+
+ def NAME # _MZZ_H # _PSEUDO : sme2_quarter_tile_outer_product_pseudo<ZPR16Mul2_Lo, ZPR16Mul2_Hi, SMEMatrixTileH>, SMEPseudo2Instr<NAME # _MZZ_H, 0>;
+
+ def : SME2_ZA_Tile_TwoVec_Pat<NAME # _MZZ_H, op, timm32_0_3, nxv8bf16>;
// Multiple and single vectors
def _M2ZZ_H : sme2_bf16_fp16_quarter_tile_outer_product<0, 1, S, mnemonic, ZZ_h_mul_r_Lo, ZPR16Mul2_Hi>;
@@ -5726,9 +5765,13 @@ class sme2_fp32_quarter_tile_outer_product<bit M, bit N, bit S, string mnemonic,
let Constraints = "$ZAda = $_ZAda";
}
-multiclass sme2_fmop4as_fp32_non_widening<bit S, string mnemonic> {
+multiclass sme2_fmop4as_fp32_non_widening<bit S, string mnemonic, SDPatternOperator op> {
// Single vectors
- def _MZZ_S : sme2_fp32_quarter_tile_outer_product<0, 0, S, mnemonic, ZPR32Mul2_Lo, ZPR32Mul2_Hi>;
+ def _MZZ_S : sme2_fp32_quarter_tile_outer_product<0, 0, S, mnemonic, ZPR32Mul2_Lo, ZPR32Mul2_Hi>, SMEPseudo2Instr<NAME # _MZZ_S, 1>;
+
+ def NAME # _MZZ_S # _PSEUDO : sme2_quarter_tile_outer_product_pseudo<ZPR32Mul2_Lo, ZPR32Mul2_Hi, SMEMatrixTileS>, SMEPseudo2Instr<NAME # _MZZ_S, 0>;
+
+ def : SME2_ZA_Tile_TwoVec_Pat<NAME # _MZZ_S, op, timm32_0_3, nxv4f32>;
// Multiple and single vectors
def _M2ZZ_S : sme2_fp32_quarter_tile_outer_product<0, 1, S, mnemonic, ZZ_s_mul_r_Lo, ZPR32Mul2_Hi>;
@@ -5763,9 +5806,13 @@ class sme2_fp64_quarter_tile_outer_product<bit M, bit N, bit S, string mnemonic,
let Constraints = "$ZAda = $_ZAda";
}
-multiclass sme2_fmop4as_fp64_non_widening<bit S, string mnemonic> {
+multiclass sme2_fmop4as_fp64_non_widening<bit S, string mnemonic, SDPatternOperator op> {
// Single vectors
- def _MZZ_D : sme2_fp64_quarter_tile_outer_product<0, 0, S, mnemonic, ZPR64Mul2_Lo, ZPR64Mul2_Hi>;
+ def _MZZ_D : sme2_fp64_quarter_tile_outer_product<0, 0, S, mnemonic, ZPR64Mul2_Lo, ZPR64Mul2_Hi>, SMEPseudo2Instr<NAME # _MZZ_D, 1>;
+
+ def NAME # _MZZ_D # _PSEUDO : sme2_quarter_tile_outer_product_pseudo<ZPR64Mul2_Lo, ZPR64Mul2_Hi, SMEMatrixTileD>, SMEPseudo2Instr<NAME # _MZZ_D, 0>;
+
+ def : SME2_ZA_Tile_TwoVec_Pat<NAME # _MZZ_D, op, timm32_0_3, nxv2f64>;
// Multiple and single vectors
def _M2ZZ_D : sme2_fp64_quarter_tile_outer_product<0, 1, S, mnemonic, ZZ_d_mul_r_Lo, ZPR64Mul2_Hi>;
@@ -5800,9 +5847,13 @@ class sme2_fp16_fp32_quarter_tile_outer_product<bit M, bit N, bit S, string mnem
let Constraints = "$ZAda = $_ZAda";
}
-multiclass sme2_fmop4as_fp16_fp32_widening<bit S, string mnemonic> {
+multiclass sme2_fmop4as_fp16_fp32_widening<bit S, string mnemonic, SDPatternOperator op> {
// Single vectors
- def _MZZ_HtoS : sme2_fp16_fp32_quarter_tile_outer_product<0, 0, S, mnemonic, ZPR16Mul2_Lo, ZPR16Mul2_Hi>;
+ def _MZZ_HtoS : sme2_fp16_fp32_quarter_tile_outer_product<0, 0, S, mnemonic, ZPR16Mul2_Lo, ZPR16Mul2_Hi>, SMEPseudo2Instr<NAME # _MZZ_HtoS, 1>;
+
+ def NAME # _MZZ_HtoS # _PSEUDO : sme2_quarter_tile_outer_product_pseudo<ZPR16Mul2_Lo, ZPR16Mul2_Hi, SMEMatrixTileS>, SMEPseudo2Instr<NAME # _MZZ_HtoS, 0>;
+
+ def : SME2_ZA_Tile_TwoVec_Pat<NAME # _MZZ_HtoS, op, timm32_0_3, nxv8f16>;
// Multiple and single vectors
def _M2ZZ_HtoS : sme2_fp16_fp32_quarter_tile_outer_product<0, 1, S, mnemonic, ZZ_h_mul_r_Lo, ZPR16Mul2_Hi>;
diff --git a/llvm/test/CodeGen/AArch64/sme2-intrinsics-mop4a_1x1.ll b/llvm/test/CodeGen/AArch64/sme2-intrinsics-mop4a_1x1.ll
new file mode 100644
index 0000000000000..df985675f3070
--- /dev/null
+++ b/llvm/test/CodeGen/AArch64/sme2-intrinsics-mop4a_1x1.ll
@@ -0,0 +1,247 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 4
+; RUN: llc -force-streaming -verify-machineinstrs < %s | FileCheck %s
+
+target triple = "aarch64-linux"
+
+; Widening
+define void @mop4a_za32_s8(<vscale x 16 x i8> %zn, <vscale x 16 x i8> %zm) #0 {
+; CHECK-LABEL: mop4a_za32_s8:
+; CHECK: // %bb.0:
+; CHECK-NEXT: mov z24.d, z1.d
+; CHECK-NEXT: smop4a za1.s, z0.b, z24.b
+; CHECK-NEXT: ret
+ call void @llvm.aarch64.sme.mop4a.wide.1x1.nxv16i8(i32 1, <vscale x 16 x i8> %zn, <vscale x 16 x i8> %zm)
+ ret void
+}
+
+define void @mop4s_za32_s8(<vscale x 16 x i8> %zn, <vscale x 16 x i8> %zm) #0 {
+; CHECK-LABEL: mop4s_za32_s8:
+; CHECK: // %bb.0:
+; CHECK-NEXT: mov z24.d, z1.d
+; CHECK-NEXT: smop4s za1.s, z0.b, z24.b
+; CHECK-NEXT: ret
+ call void @llvm.aarch64.sme.mop4s.wide.1x1.nxv16i8(i32 1, <vscale x 16 x i8> %zn, <vscale x 16 x i8> %zm)
+ ret void
+}
+
+define void @mop4a_za32_u8(<vscale x 16 x i8> %zn, <vscale x 16 x i8> %zm) #0 {
+; CHECK-LABEL: mop4a_za32_u8:
+; CHECK: // %bb.0:
+; CHECK-NEXT: mov z24.d, z1.d
+; CHECK-NEXT: smop4a za1.s, z0.b, z24.b
+; CHECK-NEXT: ret
+ call void @llvm.aarch64.sme.mop4a.wide.1x1.nxv16i8(i32 1, <vscale x 16 x i8> %zn, <vscale x 16 x i8> %zm)
+ ret void
+}
+
+define void @mop4s_za32_u8(<vscale x 16 x i8> %zn, <vscale x 16 x i8> %zm) #0 {
+; CHECK-LABEL: mop4s_za32_u8:
+; CHECK: // %bb.0:
+; CHECK-NEXT: mov z24.d, z1.d
+; CHECK-NEXT: smop4s za1.s, z0.b, z24.b
+; CHECK-NEXT: ret
+ call void @llvm.aarch64.sme.mop4s.wide.1x1.nxv16i8(i32 1, <vscale x 16 x i8> %zn, <vscale x 16 x i8> %zm)
+ ret void
+}
+
+define void @mop4a_za32_s16(<vscale x 8 x i16> %zn, <vscale x 8 x i16> %zm) #0 {
+; CHECK-LABEL: mop4a_za32_s16:
+; CHECK: // %bb.0:
+; CHECK-NEXT: mov z24.d, z1.d
+; CHECK-NEXT: smop4a za1.s, z0.h, z24.h
+; CHECK-NEXT: ret
+ call void @llvm.aarch64.sme.mop4a.wide.1x1.nxv8i16(i32 1, <vscale x 8 x i16> %zn, <vscale x 8 x i16> %zm)
+ ret void
+}
+
+define void @mop4s_za32_s16(<vscale x 8 x i16> %zn, <vscale x 8 x i16> %zm) #0 {
+; CHECK-LABEL: mop4s_za32_s16:
+; CHECK: // %bb.0:
+; CHECK-NEXT: mov z24.d, z1.d
+; CHECK-NEXT: smop4s za1.s, z0.h, z24.h
+; CHECK-NEXT: ret
+ call void @llvm.aarch64.sme.mop4s.wide.1x1.nxv8i16(i32 1, <vscale x 8 x i16> %zn, <vscale x 8 x i16> %zm)
+ ret void
+}
+
+define void @mop4a_za32_u16(<vscale x 8 x i16> %zn, <vscale x 8 x i16> %zm) #0 {
+; CHECK-LABEL: mop4a_za32_u16:
+; CHECK: // %bb.0:
+; CHECK-NEXT: mov z24.d, z1.d
+; CHECK-NEXT: smop4a za1.s, z0.h, z24.h
+; CHECK-NEXT: ret
+ call void @llvm.aarch64.sme.mop4a.wide.1x1.nxv8i16(i32 1, <vscale x 8 x i16> %zn, <vscale x 8 x i16> %zm)
+ ret void
+}
+
+define void @mop4s_za32_u16(<vscale x 8 x i16> %zn, <vscale x 8 x i16> %zm) #0 {
+; CHECK-LABEL: mop4s_za32_u16:
+; CHECK: // %bb.0:
+; CHECK-NEXT: mov z24.d, z1.d
+; CHECK-NEXT: smop4s za1.s, z0.h, z24.h
+; CHECK-NEXT: ret
+ call void @llvm.aarch64.sme.mop4s.wide.1x1.nxv8i16(i32 1, <vscale x 8 x i16> %zn, <vscale x 8 x i16> %zm)
+ ret void
+}
+
+define void @mop4a_za32_f16(<vscale x 8 x half> %zn, <vscale x 8 x half> %zm) #0 {
+; CHECK-LABEL: mop4a_za32_f16:
+; CHECK: // %bb.0:
+; CHECK-NEXT: mov z24.d, z1.d
+; CHECK-NEXT: fmop4a za1.s, z0.h, z24.h
+; CHECK-NEXT: ret
+ call void @llvm.aarch64.sme.mop4a.wide.1x1.nxv8f16(i32 1, <vscale x 8 x half> %zn, <vscale x 8 x half> %zm)
+ ret void
+}
+
+define void @mop4s_za32_f16(<vscale x 8 x half> %zn, <vscale x 8 x half> %zm) #0 {
+; CHECK-LABEL: mop4s_za32_f16:
+; CHECK: // %bb.0:
+; CHECK-NEXT: mov z24.d, z1.d
+; CHECK-NEXT: fmop4s za1.s, z0.h, z24.h
+; CHECK-NEXT: ret
+ call void @llvm.aarch64.sme.mop4s.wide.1x1.nxv8f16(i32 1, <vscale x 8 x half> %zn, <vscale x 8 x half> %zm)
+ ret void
+}
+
+define void @mop4a_za32_bf16(<vscale x 8 x bfloat> %zn, <vscale x 8 x bfloat> %zm) #0 {
+; CHECK-LABEL: mop4a_za32_bf16:
+; CHECK: // %bb.0:
+; CHECK-NEXT: mov z24.d, z1.d
+; CHECK-NEXT: bfmop4a za1.s, z0.h, z24.h
+; CHECK-NEXT: ret
+ call void @llvm.aarch64.sme.mop4a.wide.1x1.nxv8bf16(i32 1, <vscale x 8 x bfloat> %zn, <vscale x 8 x bfloat> %zm)
+ ret void
+}
+
+define void @mop4s_za32_bf16(<vscale x 8 x bfloat> %zn, <vscale x 8 x bfloat> %zm) #0 {
+; CHECK-LABEL: mop4s_za32_bf16:
+; CHECK: // %bb.0:
+; CHECK-NEXT: mov z24.d, z1.d
+; CHECK-NEXT: bfmop4s za1.s, z0.h, z24.h
+; CHECK-NEXT: ret
+ call void @llvm.aarch64.sme.mop4s.wide.1x1.nxv8bf16(i32 1, <vscale x 8 x bfloat> %zn, <vscale x 8 x bfloat> %zm)
+ ret void
+}
+
+define void @mop4a_za64_s16(<vscale x 8 x i16> %zn, <vscale x 8 x i16> %zm) #0 {
+; CHECK-LABEL: mop4a_za64_s16:
+; CHECK: // %bb.0:
+; CHECK-NEXT: mov z24.d, z1.d
+; CHECK-NEXT: smop4a za1.s, z0.h, z24.h
+; CHECK-NEXT: ret
+ call void @llvm.aarch64.sme.mop4a.wide.1x1.nxv8i16(i32 1, <vscale x 8 x i16> %zn, <vscale x 8 x i16> %zm)
+ ret void
+}
+
+define void @mop4s_za64_s16(<vscale x 8 x i16> %zn, <vscale x 8 x i16> %zm) #0 {
+; CHECK-LABEL: mop4s_za64_s16:
+; CHECK: // %bb.0:
+; CHECK-NEXT: mov z24.d, z1.d
+; CHECK-NEXT: smop4s za1.s, z0.h, z24.h
+; CHECK-NEXT: ret
+ call void @llvm.aarch64.sme.mop4s.wide.1x1.nxv8i16(i32 1, <vscale x 8 x i16> %zn, <vscale x 8 x i16> %zm)
+ ret void
+}
+
+define void @mop4a_za64_u16(<vscale x 8 x i16> %zn, <vscale x 8 x i16> %zm) #0 {
+; CHECK-LABEL: mop4a_za64_u16:
+; CHECK: // %bb.0:
+; CHECK-NEXT: mov z24.d, z1.d
+; CHECK-NEXT: smop4a za1.s, z0.h, z24.h
+; CHECK-NEXT: ret
+ call void @llvm.aarch64.sme.mop4a.wide.1x1.nxv8i16(i32 1, <vscale x 8 x i16> %zn, <vscale x 8 x i16> %zm)
+ ret void
+}
+
+define void @mop4s_za64_u16(<vscale x 8 x i16> %zn, <vscale x 8 x i16> %zm) #0 {
+; CHECK-LABEL: mop4s_za64_u16:
+; CHECK: // %bb.0:
+; CHECK-NEXT: mov z24.d, z1.d
+; CHECK-NEXT: smop4s za1.s, z0.h, z24.h
+; CHECK-NEXT: ret
+ call void @llvm.aarch64.sme.mop4s.wide.1x1.nxv8i16(i32 1, <vscale x 8 x i16> %zn, <vscale x 8 x i16> %zm)
+ ret void
+}
+
+; Non-widening
+define void @mop4a_za16_f16(<vscale x 8 x half> %zn, <vscale x 8 x half> %zm) #0 {
+; CHECK-LABEL: mop4a_za16_f16:
+; CHECK: // %bb.0:
+; CHECK-NEXT: mov z24.d, z1.d
+; CHECK-NEXT: fmop4a za1.h, z0.h, z24.h
+; CHECK-NEXT: ret
+ call void @llvm.aarch64.sme.mop4a.1x1.nxv8f16(i32 1, <vscale x 8 x half> %zn, <vscale x 8 x half> %zm)
+ ret void
+}
+
+define void @mop4s_za16_f16(<vscale x 8 x half> %zn, <vscale x 8 x half> %zm) #0 {
+; CHECK-LABEL: mop4s_za16_f16:
+; CHECK: // %bb.0:
+; CHECK-NEXT: mov z24.d, z1.d
+; CHECK-NEXT: fmop4s za1.h, z0.h, z24.h
+; CHECK-NEXT: ret
+ call void @llvm.aarch64.sme.mop4s.1x1.nxv8f16(i32 1, <vscale x 8 x half> %zn, <vscale x 8 x half> %zm)
+ ret void
+}
+
+define void @mop4a_za32_f32(<vscale x 4 x float> %zn, <vscale x 4 x float> %zm) #0 {
+; CHECK-LABEL: mop4a_za32_f32:
+; CHECK: // %bb.0:
+; CHECK-NEXT: mov z24.d, z1.d
+; CHECK-NEXT: fmop4a za1.s, z0.s, z24.s
+; CHECK-NEXT: ret
+ call void @llvm.aarch64.sme.mop4a.1x1.nxv4f32(i32 1, <vscale x 4 x float> %zn, <vscale x 4 x float> %zm)
+ ret void
+}
+
+define void @mop4s_za32_f32(<vscale x 4 x float> %zn, <vscale x 4 x float> %zm) #0 {
+; CHECK-LABEL: mop4s_za32_f32:
+; CHECK: // %bb.0:
+; CHECK-NEXT: mov z24.d, z1.d
+; CHECK-NEXT: fmop4s za1.s, z0.s, z24.s
+; CHECK-NEXT: ret
+ call void @llvm.aarch64.sme.mop4s.1x1.nxv4f32(i32 1, <vscale x 4 x float> %zn, <vscale x 4 x float> %zm)
+ ret void
+}
+
+define void @mop4a_za64_f64(<vscale x 2 x double> %zn, <vscale x 2 x double> %zm) #0 {
+; CHECK-LABEL: mop4a_za64_f64:
+; CHECK: // %bb.0:
+; CHECK-NEXT: mov z24.d, z1.d
+; CHECK-NEXT: fmop4a za1.d, z0.d, z24.d
+; CHECK-NEXT: ret
+ call void @llvm.aarch64.sme.mop4a.1x1.nxv2f64(i32 1, <vscale x 2 x double> %zn, <vscale x 2 x double> %zm)
+ ret void
+}
+
+define void @mop4s_za64_f64(<vscale x 2 x double> %zn, <vscale x 2 x double> %zm) #0 {
+; CHECK-LABEL: mop4s_za64_f64:
+; CHECK: // %bb.0:
+; CHECK-NEXT: mov z24.d, z1.d
+; CHECK-NEXT: fmop4s za1.d, z0.d, z24.d
+; CHECK-NEXT: ret
+ call void @llvm.aarch64.sme.mop4s.1x1.nxv2f64(i32 1, <vscale x 2 x double> %zn, <vscale x 2 x double> %zm)
+ ret void
+}
+
+define void @mop4a_za16_bf16(<vscale x 8 x bfloat> %zn, <vscale x 8 x bfloat> %zm) #0 {
+; CHECK-LABEL: mop4a_za16_bf16:
+; CHECK: // %bb.0:
+; CHECK-NEXT: mov z24.d, z1.d
+; CHECK-NEXT: bfmop4a za1.h, z0.h, z24.h
+; CHECK-NEXT: ret
+ call void @llvm.aarch64.sme.mop4a.1x1.nxv8bf16(i32 1, <vscale x 8 x bfloat> %zn, <vscale x 8 x bfloat> %zm)
+ ret void
+}
+
+define void @mop4s_za16_bf16(<vscale x 8 x bfloat> %zn, <vscale x 8 x bfloat> %zm) #0 {
+; CHECK-LABEL: mop4s_za16_bf16:
+; CHECK: // %bb.0:
+; CHECK-NEXT: mov z24.d, z1.d
+; CHECK-NEXT: bfmop4s za1.h, z0.h, z24.h
+; CHECK-NEXT: ret
+ call void @llvm.aarch64.sme.mop4s.1x1.nxv8bf16(i32 1, <vscale x 8 x bfloat> %zn, <vscale x 8 x bfloat> %zm)
+ ret void
+}
+attributes #0 = { nounwind "target-features"="+sme-i16i64,+sme-f64f64,+sme-b16b16,+sme2p1,+bf16,+sme-f16f16,+sme-mop4" }
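
For context, a minimal C-level sketch of how these 1x1 intrinsics are
exercised, assuming the ACLE names generated by the arm_sme.td changes
above; the tile index is illustrative (any immediate in 0..3 is valid):

#include <arm_sme.h>

// Accumulate the outer product of two single vectors into quarter-tile 3
// of ZA32. At this point in the series this lowers to
// @llvm.aarch64.sme.mop4a.wide.1x1.nxv16i8 (patch 03 below later splits
// the intrinsic into sign-specific smop4a/umop4a forms).
void accumulate_s8(svint8_t zn, svint8_t zm)
    __arm_streaming __arm_inout("za") {
  svmop4a_1x1_za32_s8_s8(3, zn, zm);
}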
>From 228b75729a799d929a114fbebbcad223520c7d77 Mon Sep 17 00:00:00 2001
From: Virginia Cangelosi <virginia.cangelosi at arm.com>
Date: Wed, 19 Feb 2025 11:58:46 +0000
Subject: [PATCH 02/11] Add whitespace back in to simplify patch
---
llvm/include/llvm/IR/IntrinsicsAArch64.td | 38 +++++++++++-----------
llvm/lib/Target/AArch64/SMEInstrFormats.td | 6 ++--
2 files changed, 22 insertions(+), 22 deletions(-)
diff --git a/llvm/include/llvm/IR/IntrinsicsAArch64.td b/llvm/include/llvm/IR/IntrinsicsAArch64.td
index 0714602a2f09b..24052d8a45d75 100644
--- a/llvm/include/llvm/IR/IntrinsicsAArch64.td
+++ b/llvm/include/llvm/IR/IntrinsicsAArch64.td
@@ -1497,7 +1497,7 @@ let TargetPrefix = "aarch64" in { // All intrinsics start with "llvm.aarch64.".
LLVMSubdivide2VectorType<0>,
llvm_i32_ty],
[IntrNoMem, ImmArg<ArgIndex<3>>]>;
-
+
class SVE2_1VectorArgIndexed_Intrinsic
: DefaultAttrsIntrinsic<[llvm_anyvector_ty],
[LLVMMatchType<0>,
@@ -1512,7 +1512,7 @@ let TargetPrefix = "aarch64" in { // All intrinsics start with "llvm.aarch64.".
llvm_i32_ty,
llvm_i32_ty],
[IntrNoMem, ImmArg<ArgIndex<3>>, ImmArg<ArgIndex<4>>]>;
-
+
class SVE2_1VectorArg_Pred_Intrinsic
: DefaultAttrsIntrinsic<[LLVMScalarOrSameVectorWidth<0, llvm_i1_ty>],
[llvm_anyvector_ty],
@@ -1522,7 +1522,7 @@ let TargetPrefix = "aarch64" in { // All intrinsics start with "llvm.aarch64.".
: DefaultAttrsIntrinsic<[LLVMScalarOrSameVectorWidth<0, llvm_i1_ty>],
[llvm_anyvector_ty, llvm_i32_ty],
[IntrNoMem, ImmArg<ArgIndex<1>>]>;
-
+
class SVE2_Pred_1VectorArgIndexed_Intrinsic
: DefaultAttrsIntrinsic<[llvm_anyvector_ty],
[LLVMMatchType<0>,
@@ -3330,11 +3330,11 @@ let TargetPrefix = "aarch64" in {
: DefaultAttrsIntrinsic<[llvm_nxv8bf16_ty],
[llvm_nxv4f32_ty, llvm_nxv4f32_ty],
[IntrNoMem]>;
-
+
class SME2_CVT_WIDENING_VG2_Intrinsic
: DefaultAttrsIntrinsic<[llvm_anyvector_ty, LLVMMatchType<0>],
[LLVMSubdivide2VectorType<0>], [IntrNoMem]>;
-
+
class SME2_CVT_VG4_SINGLE_Intrinsic
: DefaultAttrsIntrinsic<[LLVMSubdivide4VectorType<0>],
@@ -3575,7 +3575,7 @@ let TargetPrefix = "aarch64" in {
foreach vg = ["vg1x2", "vg1x4", "vg2x1", "vg2x2", "vg2x4", "vg4x1", "vg4x2", "vg4x4"] in {
def int_aarch64_sme_zero_za64_ # vg : DefaultAttrsIntrinsic<[], [llvm_i32_ty], [IntrNoMem, IntrHasSideEffects]>;
}
-
+
// Multi-vector signed saturating doubling multiply high
def int_aarch64_sve_sqdmulh_single_vgx2 : SME2_VG2_Multi_Single_Intrinsic;
@@ -3645,7 +3645,7 @@ let TargetPrefix = "aarch64" in {
//
//Multi-vector floating-point convert from half-precision to deinterleaved single-precision.
//
-
+
def int_aarch64_sve_fcvtl_widen_x2 : SME2_CVT_WIDENING_VG2_Intrinsic;
//
@@ -3837,7 +3837,7 @@ let TargetPrefix = "aarch64" in {
def int_aarch64_sme_luti4_lane_zt
: DefaultAttrsIntrinsic<[llvm_anyvector_ty], [llvm_i32_ty, llvm_nxv16i8_ty, llvm_i32_ty],
[ImmArg<ArgIndex<0>>, ImmArg<ArgIndex<2>>, IntrReadMem]>;
-
+
// Lookup table expand two registers
//
def int_aarch64_sme_luti2_lane_zt_x2
@@ -3864,7 +3864,7 @@ let TargetPrefix = "aarch64" in {
[llvm_i32_ty, llvm_nxv16i8_ty, llvm_nxv16i8_ty],
[ImmArg<ArgIndex<0>>, IntrNoMem, IntrHasSideEffects]>;
-
+
//
// Register scaling
//
@@ -3912,7 +3912,7 @@ def int_aarch64_sve_extq : AdvSIMD_2VectorArgIndexed_Intrinsic;
//
// SVE2.1 - Move predicate to/from vector
//
-def int_aarch64_sve_pmov_to_pred_lane : SVE2_1VectorArgIndexed_Pred_Intrinsic;
+def int_aarch64_sve_pmov_to_pred_lane : SVE2_1VectorArgIndexed_Pred_Intrinsic;
def int_aarch64_sve_pmov_to_pred_lane_zero : SVE2_1VectorArg_Pred_Intrinsic;
@@ -3954,10 +3954,10 @@ let TargetPrefix = "aarch64" in {
: DefaultAttrsIntrinsic<[llvm_nxv16i8_ty],
[llvm_anyvector_ty, LLVMMatchType<0>],
[IntrReadMem, IntrInaccessibleMemOnly]>;
-
+
def int_aarch64_sve_fp8_cvtn : SVE2_FP8_Narrow_Cvt;
def int_aarch64_sve_fp8_cvtnb : SVE2_FP8_Narrow_Cvt;
-
+
def int_aarch64_sve_fp8_cvtnt
: DefaultAttrsIntrinsic<[llvm_nxv16i8_ty],
[llvm_nxv16i8_ty, llvm_anyvector_ty, LLVMMatchType<0>],
@@ -3969,32 +3969,32 @@ let TargetPrefix = "aarch64" in {
[LLVMMatchType<0>,
llvm_nxv16i8_ty, llvm_nxv16i8_ty],
[IntrReadMem, IntrInaccessibleMemOnly]>;
-
+
class SVE2_FP8_FMLA_FDOT_Lane
: DefaultAttrsIntrinsic<[llvm_anyvector_ty],
[LLVMMatchType<0>,
llvm_nxv16i8_ty, llvm_nxv16i8_ty, llvm_i32_ty],
[IntrReadMem, IntrInaccessibleMemOnly, ImmArg<ArgIndex<3>>]>;
-
+
def int_aarch64_sve_fp8_fdot : SVE2_FP8_FMLA_FDOT;
def int_aarch64_sve_fp8_fdot_lane : SVE2_FP8_FMLA_FDOT_Lane;
// Fused multiply-add
def int_aarch64_sve_fp8_fmlalb : SVE2_FP8_FMLA_FDOT;
def int_aarch64_sve_fp8_fmlalb_lane : SVE2_FP8_FMLA_FDOT_Lane;
-
+
def int_aarch64_sve_fp8_fmlalt : SVE2_FP8_FMLA_FDOT;
def int_aarch64_sve_fp8_fmlalt_lane : SVE2_FP8_FMLA_FDOT_Lane;
-
+
def int_aarch64_sve_fp8_fmlallbb : SVE2_FP8_FMLA_FDOT;
def int_aarch64_sve_fp8_fmlallbb_lane : SVE2_FP8_FMLA_FDOT_Lane;
-
+
def int_aarch64_sve_fp8_fmlallbt : SVE2_FP8_FMLA_FDOT;
def int_aarch64_sve_fp8_fmlallbt_lane : SVE2_FP8_FMLA_FDOT_Lane;
-
+
def int_aarch64_sve_fp8_fmlalltb : SVE2_FP8_FMLA_FDOT;
def int_aarch64_sve_fp8_fmlalltb_lane : SVE2_FP8_FMLA_FDOT_Lane;
-
+
def int_aarch64_sve_fp8_fmlalltt : SVE2_FP8_FMLA_FDOT;
def int_aarch64_sve_fp8_fmlalltt_lane : SVE2_FP8_FMLA_FDOT_Lane;
diff --git a/llvm/lib/Target/AArch64/SMEInstrFormats.td b/llvm/lib/Target/AArch64/SMEInstrFormats.td
index 5a3d12e9f7b8b..755531505636d 100644
--- a/llvm/lib/Target/AArch64/SMEInstrFormats.td
+++ b/llvm/lib/Target/AArch64/SMEInstrFormats.td
@@ -260,7 +260,7 @@ class SME2_Tile_VG4_Multi_Pat<string name, SDPatternOperator intrinsic, Operand
class SME2_Zero_Matrix_Pat<string name, SDPatternOperator intrinsic, Operand offset_ty, ComplexPattern tileslice>
: Pat<(intrinsic (i32 (tileslice MatrixIndexGPR32Op8_11:$base, offset_ty:$offset))),
- (!cast<Instruction>(name) $base, $offset)>;
+ (!cast<Instruction>(name) $base, $offset)>;
class SME2_Tile_Movaz_Pat<string name, SDPatternOperator intrinsic, ValueType out_vt, Operand tile_imm, Operand index_ty, ComplexPattern tileslice>
: Pat<(out_vt (intrinsic tile_imm:$tile, (i32 (tileslice MatrixIndexGPR32Op12_15:$base, index_ty:$offset)))),
@@ -2258,7 +2258,7 @@ multiclass sme2_int_mla_long_array_vg2_single<string mnemonic, bits<2> op, SDPat
multiclass sme2_fp_mla_long_array_vg4_single<string mnemonic, bits<3> op, MatrixOperand matrix_ty,
RegisterOperand multi_vector_ty, ZPRRegOp vector_ty,
ValueType zpr_ty, SDPatternOperator intrinsic, list<Register> uses=[]> {
- def NAME : sme2_mla_long_array_vg24_single<0b00, 0b1, op{2-1}, op{0}, matrix_ty, multi_vector_ty,
+ def NAME : sme2_mla_long_array_vg24_single<0b00, 0b1, op{2-1}, op{0}, matrix_ty, multi_vector_ty,
vector_ty, mnemonic, "vgx4">, SMEPseudo2Instr<NAME, 1> {
let Uses = uses;
}
@@ -5331,7 +5331,7 @@ multiclass sme2p1_zero_matrix<string mnemonic> {
def : SME2_Zero_Matrix_Pat<NAME # _4Z_PSEUDO, int_aarch64_sme_zero_za64_vg4x1, uimm1s4range, tileslicerange1s4>;
def : SME2_Zero_Matrix_Pat<NAME # _VG2_4Z_PSEUDO, int_aarch64_sme_zero_za64_vg4x2, uimm0s4range, tileslicerange0s4>;
def : SME2_Zero_Matrix_Pat<NAME # _VG4_4Z_PSEUDO, int_aarch64_sme_zero_za64_vg4x4, uimm0s4range, tileslicerange0s4>;
-}
+}
//===----------------------------------------------------------------------===//
// SME2.1 lookup table expand two non-contiguous registers
>From e7dd7caa157b39074eaf5d09aded9708cdd794d5 Mon Sep 17 00:00:00 2001
From: Virginia Cangelosi <virginia.cangelosi at arm.com>
Date: Mon, 24 Feb 2025 13:33:42 +0000
Subject: [PATCH 03/11] Add extra intrinsics to differentiate umop4/smop4
---
clang/include/clang/Basic/arm_sme.td | 26 +++---
.../sme2-intrinsics/acle_sme2_mop4_1x1.c | 80 +++++++++----------
llvm/include/llvm/IR/IntrinsicsAArch64.td | 26 ++++--
.../lib/Target/AArch64/AArch64SMEInstrInfo.td | 66 +++++++--------
llvm/lib/Target/AArch64/SMEInstrFormats.td | 56 ++++++-------
.../AArch64/sme2-intrinsics-mop4a_1x1.ll | 36 ++++-----
6 files changed, 153 insertions(+), 137 deletions(-)
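
For context, a rough C-level sketch of the sign differentiation this patch
introduces, assuming the ACLE names from the arm_sme.td and test changes
below; the tile index and 8-bit element types are illustrative:

#include <arm_sme.h>

// Each sign combination now selects its own LLVM intrinsic, instead of
// all four sharing aarch64_sme_mop4{a,s}_wide_1x1 as in patch 01.
void sign_combinations(svint8_t sn, svuint8_t un)
    __arm_streaming __arm_inout("za") {
  svmop4a_1x1_za32_s8_s8(3, sn, sn); // -> llvm.aarch64.sme.smop4a.wide.1x1
  svmop4a_1x1_za32_u8_u8(3, un, un); // -> llvm.aarch64.sme.umop4a.wide.1x1
  svmop4a_1x1_za32_s8_u8(3, sn, un); // -> llvm.aarch64.sme.sumop4a.wide.1x1
  svmop4a_1x1_za32_u8_s8(3, un, sn); // -> llvm.aarch64.sme.usmop4a.wide.1x1
}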
diff --git a/clang/include/clang/Basic/arm_sme.td b/clang/include/clang/Basic/arm_sme.td
index 2af29ad6699b6..bf76e4ce53182 100644
--- a/clang/include/clang/Basic/arm_sme.td
+++ b/clang/include/clang/Basic/arm_sme.td
@@ -384,7 +384,7 @@ multiclass MOP4MixedSignsSingleSingle<string n_suffix1, string n_suffix2, string
def NAME : SInst<"sv" # n_suffix2 # "_1x1_" # za # "[_{2}_{3}]",
"vid" # !cond(!eq(n_suffix1, "su") : "u", true: "x"),
!cond(!eq(n_suffix1, "su") : "", true: "U") # t,
- MergeNone, "aarch64_sme_" # n_suffix2 # "_wide_1x1",
+ MergeNone, "aarch64_sme_" # n_suffix1 # n_suffix2 # "_wide_1x1",
[IsStreaming, IsInOutZA],
[ImmCheck<0, ImmCheck0_3>]>;
}
@@ -400,15 +400,15 @@ let SMETargetGuard = "sme2" in {
def SVBMOPS : Inst<"svbmops_za32[_{d}]_m", "viPPdd", "iUi", MergeNone, "aarch64_sme_bmops_za32", [IsInOutZA, IsStreaming], [ImmCheck<0, ImmCheck0_3>]>;
- defm SVSMOP4A_MZZ_HtoS : MOP4SingleSingle<"a", "za32", "s", "aarch64_sme_mop4a", "_wide">;
- defm SVSMOP4S_MZZ_HtoS : MOP4SingleSingle<"s", "za32", "s", "aarch64_sme_mop4s", "_wide">;
- defm SVSMOP4A_MZZ_BToS : MOP4SingleSingle<"a", "za32", "c", "aarch64_sme_mop4a", "_wide">;
- defm SVSMOP4S_MZZ_BToS : MOP4SingleSingle<"s", "za32", "c", "aarch64_sme_mop4s", "_wide">;
+ defm SVSMOP4A_MZZ_HtoS : MOP4SingleSingle<"a", "za32", "s", "aarch64_sme_smop4a", "_wide">;
+ defm SVSMOP4S_MZZ_HtoS : MOP4SingleSingle<"s", "za32", "s", "aarch64_sme_smop4s", "_wide">;
+ defm SVSMOP4A_MZZ_BToS : MOP4SingleSingle<"a", "za32", "c", "aarch64_sme_smop4a", "_wide">;
+ defm SVSMOP4S_MZZ_BToS : MOP4SingleSingle<"s", "za32", "c", "aarch64_sme_smop4s", "_wide">;
- defm SVUMOP4A_MZZ_HtoS : MOP4SingleSingle<"a", "za32", "Us", "aarch64_sme_mop4a", "_wide">;
- defm SVUMOP4S_MZZ_HtoS : MOP4SingleSingle<"s", "za32", "Us", "aarch64_sme_mop4s", "_wide">;
- defm SVUMOP4A_MZZ_BToS : MOP4SingleSingle<"a", "za32", "Uc", "aarch64_sme_mop4a", "_wide">;
- defm SVUMOP4S_MZZ_BToS : MOP4SingleSingle<"s", "za32", "Uc", "aarch64_sme_mop4s", "_wide">;
+ defm SVUMOP4A_MZZ_HtoS : MOP4SingleSingle<"a", "za32", "Us", "aarch64_sme_umop4a", "_wide">;
+ defm SVUMOP4S_MZZ_HtoS : MOP4SingleSingle<"s", "za32", "Us", "aarch64_sme_umop4s", "_wide">;
+ defm SVUMOP4A_MZZ_BToS : MOP4SingleSingle<"a", "za32", "Uc", "aarch64_sme_umop4a", "_wide">;
+ defm SVUMOP4S_MZZ_BToS : MOP4SingleSingle<"s", "za32", "Uc", "aarch64_sme_umop4s", "_wide">;
defm SVFMOP4A_MZZ_HtoS : MOP4SingleSingle<"a", "za32", "h", "aarch64_sme_mop4a", "_wide">;
defm SVFMOP4S_MZZ_HtoS : MOP4SingleSingle<"s", "za32", "h", "aarch64_sme_mop4s", "_wide">;
@@ -473,10 +473,10 @@ let SMETargetGuard = "sme2" in {
}
let SMETargetGuard = "sme2,sme-i16i64" in {
- defm SVSMOP4A_MZZ_HtoD : MOP4SingleSingle<"a", "za64", "s", "aarch64_sme_mop4a", "_wide">;
- defm SVSMOP4S_MZZ_HtoD : MOP4SingleSingle<"s", "za64", "s", "aarch64_sme_mop4s", "_wide">;
- defm SVUMOP4A_MZZ_HtoD : MOP4SingleSingle<"a", "za64", "Us", "aarch64_sme_mop4a", "_wide">;
- defm SVUMOP4S_MZZ_HtoD : MOP4SingleSingle<"s", "za64", "Us", "aarch64_sme_mop4s", "_wide">;
+ defm SVSMOP4A_MZZ_HtoD : MOP4SingleSingle<"a", "za64", "s", "aarch64_sme_smop4a", "_wide">;
+ defm SVSMOP4S_MZZ_HtoD : MOP4SingleSingle<"s", "za64", "s", "aarch64_sme_smop4s", "_wide">;
+ defm SVUMOP4A_MZZ_HtoD : MOP4SingleSingle<"a", "za64", "Us", "aarch64_sme_umop4a", "_wide">;
+ defm SVUMOP4S_MZZ_HtoD : MOP4SingleSingle<"s", "za64", "Us", "aarch64_sme_umop4s", "_wide">;
defm SVSUMOP4A_MZZ_HtoD : MOP4MixedSignsSingleSingle<"su", "mop4a", "za64", "s">;
defm SVUSMOP4A_MZZ_HtoD : MOP4MixedSignsSingleSingle<"us", "mop4a", "za64", "s">;
defm SVSUMOP4S_MZZ_HtoD : MOP4MixedSignsSingleSingle<"su", "mop4s", "za64", "s">;
diff --git a/clang/test/CodeGen/AArch64/sme2-intrinsics/acle_sme2_mop4_1x1.c b/clang/test/CodeGen/AArch64/sme2-intrinsics/acle_sme2_mop4_1x1.c
index 37238053009fd..eac3648f90368 100644
--- a/clang/test/CodeGen/AArch64/sme2-intrinsics/acle_sme2_mop4_1x1.c
+++ b/clang/test/CodeGen/AArch64/sme2-intrinsics/acle_sme2_mop4_1x1.c
@@ -18,12 +18,12 @@
// CHECK-LABEL: @test_svmop4a_1x1_za32_s8_s8(
// CHECK-NEXT: entry:
-// CHECK-NEXT: tail call void @llvm.aarch64.sme.mop4a.wide.1x1.nxv16i8(i32 3, <vscale x 16 x i8> [[ZN:%.*]], <vscale x 16 x i8> [[ZM:%.*]])
+// CHECK-NEXT: tail call void @llvm.aarch64.sme.smop4a.wide.1x1.nxv16i8(i32 3, <vscale x 16 x i8> [[ZN:%.*]], <vscale x 16 x i8> [[ZM:%.*]])
// CHECK-NEXT: ret void
//
// CPP-CHECK-LABEL: @_Z27test_svmop4a_1x1_za32_s8_s8u10__SVInt8_tS_(
// CPP-CHECK-NEXT: entry:
-// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.mop4a.wide.1x1.nxv16i8(i32 3, <vscale x 16 x i8> [[ZN:%.*]], <vscale x 16 x i8> [[ZM:%.*]])
+// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.smop4a.wide.1x1.nxv16i8(i32 3, <vscale x 16 x i8> [[ZN:%.*]], <vscale x 16 x i8> [[ZM:%.*]])
// CPP-CHECK-NEXT: ret void
//
void test_svmop4a_1x1_za32_s8_s8(svint8_t zn, svint8_t zm) __arm_streaming __arm_inout("za") {
@@ -32,12 +32,12 @@ void test_svmop4a_1x1_za32_s8_s8(svint8_t zn, svint8_t zm) __arm_streaming __arm
// CHECK-LABEL: @test_svmop4s_1x1_za32_s8_s8(
// CHECK-NEXT: entry:
-// CHECK-NEXT: tail call void @llvm.aarch64.sme.mop4s.wide.1x1.nxv16i8(i32 3, <vscale x 16 x i8> [[ZN:%.*]], <vscale x 16 x i8> [[ZM:%.*]])
+// CHECK-NEXT: tail call void @llvm.aarch64.sme.smop4s.wide.1x1.nxv16i8(i32 3, <vscale x 16 x i8> [[ZN:%.*]], <vscale x 16 x i8> [[ZM:%.*]])
// CHECK-NEXT: ret void
//
// CPP-CHECK-LABEL: @_Z27test_svmop4s_1x1_za32_s8_s8u10__SVInt8_tS_(
// CPP-CHECK-NEXT: entry:
-// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.mop4s.wide.1x1.nxv16i8(i32 3, <vscale x 16 x i8> [[ZN:%.*]], <vscale x 16 x i8> [[ZM:%.*]])
+// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.smop4s.wide.1x1.nxv16i8(i32 3, <vscale x 16 x i8> [[ZN:%.*]], <vscale x 16 x i8> [[ZM:%.*]])
// CPP-CHECK-NEXT: ret void
//
void test_svmop4s_1x1_za32_s8_s8(svint8_t zn, svint8_t zm) __arm_streaming __arm_inout("za") {
@@ -46,12 +46,12 @@ void test_svmop4s_1x1_za32_s8_s8(svint8_t zn, svint8_t zm) __arm_streaming __arm
// CHECK-LABEL: @test_svmop4a_1x1_za32_u8_u8(
// CHECK-NEXT: entry:
-// CHECK-NEXT: tail call void @llvm.aarch64.sme.mop4a.wide.1x1.nxv16i8(i32 3, <vscale x 16 x i8> [[ZN:%.*]], <vscale x 16 x i8> [[ZM:%.*]])
+// CHECK-NEXT: tail call void @llvm.aarch64.sme.umop4a.wide.1x1.nxv16i8(i32 3, <vscale x 16 x i8> [[ZN:%.*]], <vscale x 16 x i8> [[ZM:%.*]])
// CHECK-NEXT: ret void
//
// CPP-CHECK-LABEL: @_Z27test_svmop4a_1x1_za32_u8_u8u11__SVUint8_tS_(
// CPP-CHECK-NEXT: entry:
-// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.mop4a.wide.1x1.nxv16i8(i32 3, <vscale x 16 x i8> [[ZN:%.*]], <vscale x 16 x i8> [[ZM:%.*]])
+// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.umop4a.wide.1x1.nxv16i8(i32 3, <vscale x 16 x i8> [[ZN:%.*]], <vscale x 16 x i8> [[ZM:%.*]])
// CPP-CHECK-NEXT: ret void
//
void test_svmop4a_1x1_za32_u8_u8(svuint8_t zn, svuint8_t zm) __arm_streaming __arm_inout("za") {
@@ -60,12 +60,12 @@ void test_svmop4a_1x1_za32_u8_u8(svuint8_t zn, svuint8_t zm) __arm_streaming __a
// CHECK-LABEL: @test_svmop4s_1x1_za32_u8_u8(
// CHECK-NEXT: entry:
-// CHECK-NEXT: tail call void @llvm.aarch64.sme.mop4s.wide.1x1.nxv16i8(i32 3, <vscale x 16 x i8> [[ZN:%.*]], <vscale x 16 x i8> [[ZM:%.*]])
+// CHECK-NEXT: tail call void @llvm.aarch64.sme.umop4s.wide.1x1.nxv16i8(i32 3, <vscale x 16 x i8> [[ZN:%.*]], <vscale x 16 x i8> [[ZM:%.*]])
// CHECK-NEXT: ret void
//
// CPP-CHECK-LABEL: @_Z27test_svmop4s_1x1_za32_u8_u8u11__SVUint8_tS_(
// CPP-CHECK-NEXT: entry:
-// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.mop4s.wide.1x1.nxv16i8(i32 3, <vscale x 16 x i8> [[ZN:%.*]], <vscale x 16 x i8> [[ZM:%.*]])
+// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.umop4s.wide.1x1.nxv16i8(i32 3, <vscale x 16 x i8> [[ZN:%.*]], <vscale x 16 x i8> [[ZM:%.*]])
// CPP-CHECK-NEXT: ret void
//
void test_svmop4s_1x1_za32_u8_u8(svuint8_t zn, svuint8_t zm) __arm_streaming __arm_inout("za") {
@@ -74,12 +74,12 @@ void test_svmop4s_1x1_za32_u8_u8(svuint8_t zn, svuint8_t zm) __arm_streaming __a
// CHECK-LABEL: @test_svmop4a_1x1_za32_s16_s16(
// CHECK-NEXT: entry:
-// CHECK-NEXT: tail call void @llvm.aarch64.sme.mop4a.wide.1x1.nxv8i16(i32 3, <vscale x 8 x i16> [[ZN:%.*]], <vscale x 8 x i16> [[ZM:%.*]])
+// CHECK-NEXT: tail call void @llvm.aarch64.sme.smop4a.wide.1x1.nxv8i16(i32 3, <vscale x 8 x i16> [[ZN:%.*]], <vscale x 8 x i16> [[ZM:%.*]])
// CHECK-NEXT: ret void
//
// CPP-CHECK-LABEL: @_Z29test_svmop4a_1x1_za32_s16_s16u11__SVInt16_tS_(
// CPP-CHECK-NEXT: entry:
-// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.mop4a.wide.1x1.nxv8i16(i32 3, <vscale x 8 x i16> [[ZN:%.*]], <vscale x 8 x i16> [[ZM:%.*]])
+// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.smop4a.wide.1x1.nxv8i16(i32 3, <vscale x 8 x i16> [[ZN:%.*]], <vscale x 8 x i16> [[ZM:%.*]])
// CPP-CHECK-NEXT: ret void
//
void test_svmop4a_1x1_za32_s16_s16(svint16_t zn, svint16_t zm) __arm_streaming __arm_inout("za") {
@@ -88,12 +88,12 @@ void test_svmop4a_1x1_za32_s16_s16(svint16_t zn, svint16_t zm) __arm_streaming _
// CHECK-LABEL: @test_svmop4s_1x1_za32_s16_s16(
// CHECK-NEXT: entry:
-// CHECK-NEXT: tail call void @llvm.aarch64.sme.mop4s.wide.1x1.nxv8i16(i32 3, <vscale x 8 x i16> [[ZN:%.*]], <vscale x 8 x i16> [[ZM:%.*]])
+// CHECK-NEXT: tail call void @llvm.aarch64.sme.smop4s.wide.1x1.nxv8i16(i32 3, <vscale x 8 x i16> [[ZN:%.*]], <vscale x 8 x i16> [[ZM:%.*]])
// CHECK-NEXT: ret void
//
// CPP-CHECK-LABEL: @_Z29test_svmop4s_1x1_za32_s16_s16u11__SVInt16_tS_(
// CPP-CHECK-NEXT: entry:
-// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.mop4s.wide.1x1.nxv8i16(i32 3, <vscale x 8 x i16> [[ZN:%.*]], <vscale x 8 x i16> [[ZM:%.*]])
+// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.smop4s.wide.1x1.nxv8i16(i32 3, <vscale x 8 x i16> [[ZN:%.*]], <vscale x 8 x i16> [[ZM:%.*]])
// CPP-CHECK-NEXT: ret void
//
void test_svmop4s_1x1_za32_s16_s16(svint16_t zn, svint16_t zm) __arm_streaming __arm_inout("za") {
@@ -102,12 +102,12 @@ void test_svmop4s_1x1_za32_s16_s16(svint16_t zn, svint16_t zm) __arm_streaming _
// CHECK-LABEL: @test_svmop4a_1x1_za32_u16_u16(
// CHECK-NEXT: entry:
-// CHECK-NEXT: tail call void @llvm.aarch64.sme.mop4a.wide.1x1.nxv8i16(i32 3, <vscale x 8 x i16> [[ZN:%.*]], <vscale x 8 x i16> [[ZM:%.*]])
+// CHECK-NEXT: tail call void @llvm.aarch64.sme.umop4a.wide.1x1.nxv8i16(i32 3, <vscale x 8 x i16> [[ZN:%.*]], <vscale x 8 x i16> [[ZM:%.*]])
// CHECK-NEXT: ret void
//
// CPP-CHECK-LABEL: @_Z29test_svmop4a_1x1_za32_u16_u16u12__SVUint16_tS_(
// CPP-CHECK-NEXT: entry:
-// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.mop4a.wide.1x1.nxv8i16(i32 3, <vscale x 8 x i16> [[ZN:%.*]], <vscale x 8 x i16> [[ZM:%.*]])
+// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.umop4a.wide.1x1.nxv8i16(i32 3, <vscale x 8 x i16> [[ZN:%.*]], <vscale x 8 x i16> [[ZM:%.*]])
// CPP-CHECK-NEXT: ret void
//
void test_svmop4a_1x1_za32_u16_u16(svuint16_t zn, svuint16_t zm) __arm_streaming __arm_inout("za") {
@@ -116,12 +116,12 @@ void test_svmop4a_1x1_za32_u16_u16(svuint16_t zn, svuint16_t zm) __arm_streaming
// CHECK-LABEL: @test_svmop4s_1x1_za32_u16_u16(
// CHECK-NEXT: entry:
-// CHECK-NEXT: tail call void @llvm.aarch64.sme.mop4s.wide.1x1.nxv8i16(i32 3, <vscale x 8 x i16> [[ZN:%.*]], <vscale x 8 x i16> [[ZM:%.*]])
+// CHECK-NEXT: tail call void @llvm.aarch64.sme.umop4s.wide.1x1.nxv8i16(i32 3, <vscale x 8 x i16> [[ZN:%.*]], <vscale x 8 x i16> [[ZM:%.*]])
// CHECK-NEXT: ret void
//
// CPP-CHECK-LABEL: @_Z29test_svmop4s_1x1_za32_u16_u16u12__SVUint16_tS_(
// CPP-CHECK-NEXT: entry:
-// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.mop4s.wide.1x1.nxv8i16(i32 3, <vscale x 8 x i16> [[ZN:%.*]], <vscale x 8 x i16> [[ZM:%.*]])
+// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.umop4s.wide.1x1.nxv8i16(i32 3, <vscale x 8 x i16> [[ZN:%.*]], <vscale x 8 x i16> [[ZM:%.*]])
// CPP-CHECK-NEXT: ret void
//
void test_svmop4s_1x1_za32_u16_u16(svuint16_t zn, svuint16_t zm) __arm_streaming __arm_inout("za") {
@@ -186,12 +186,12 @@ void test_svmop4s_1x1_za32_bf16_bf16(svbfloat16_t zn, svbfloat16_t zm) __arm_str
// CHECK-LABEL: @test_svmop4a_1x1_za64_s16_s16(
// CHECK-NEXT: entry:
-// CHECK-NEXT: tail call void @llvm.aarch64.sme.mop4a.wide.1x1.nxv8i16(i32 3, <vscale x 8 x i16> [[ZN:%.*]], <vscale x 8 x i16> [[ZM:%.*]])
+// CHECK-NEXT: tail call void @llvm.aarch64.sme.smop4a.wide.1x1.nxv8i16(i32 3, <vscale x 8 x i16> [[ZN:%.*]], <vscale x 8 x i16> [[ZM:%.*]])
// CHECK-NEXT: ret void
//
// CPP-CHECK-LABEL: @_Z29test_svmop4a_1x1_za64_s16_s16u11__SVInt16_tS_(
// CPP-CHECK-NEXT: entry:
-// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.mop4a.wide.1x1.nxv8i16(i32 3, <vscale x 8 x i16> [[ZN:%.*]], <vscale x 8 x i16> [[ZM:%.*]])
+// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.smop4a.wide.1x1.nxv8i16(i32 3, <vscale x 8 x i16> [[ZN:%.*]], <vscale x 8 x i16> [[ZM:%.*]])
// CPP-CHECK-NEXT: ret void
//
void test_svmop4a_1x1_za64_s16_s16(svint16_t zn, svint16_t zm) __arm_streaming __arm_inout("za") {
@@ -200,12 +200,12 @@ void test_svmop4a_1x1_za64_s16_s16(svint16_t zn, svint16_t zm) __arm_streaming _
// CHECK-LABEL: @test_svmop4s_1x1_za64_s16_s16(
// CHECK-NEXT: entry:
-// CHECK-NEXT: tail call void @llvm.aarch64.sme.mop4s.wide.1x1.nxv8i16(i32 3, <vscale x 8 x i16> [[ZN:%.*]], <vscale x 8 x i16> [[ZM:%.*]])
+// CHECK-NEXT: tail call void @llvm.aarch64.sme.smop4s.wide.1x1.nxv8i16(i32 3, <vscale x 8 x i16> [[ZN:%.*]], <vscale x 8 x i16> [[ZM:%.*]])
// CHECK-NEXT: ret void
//
// CPP-CHECK-LABEL: @_Z29test_svmop4s_1x1_za64_s16_s16u11__SVInt16_tS_(
// CPP-CHECK-NEXT: entry:
-// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.mop4s.wide.1x1.nxv8i16(i32 3, <vscale x 8 x i16> [[ZN:%.*]], <vscale x 8 x i16> [[ZM:%.*]])
+// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.smop4s.wide.1x1.nxv8i16(i32 3, <vscale x 8 x i16> [[ZN:%.*]], <vscale x 8 x i16> [[ZM:%.*]])
// CPP-CHECK-NEXT: ret void
//
void test_svmop4s_1x1_za64_s16_s16(svint16_t zn, svint16_t zm) __arm_streaming __arm_inout("za") {
@@ -214,12 +214,12 @@ void test_svmop4s_1x1_za64_s16_s16(svint16_t zn, svint16_t zm) __arm_streaming _
// CHECK-LABEL: @test_svmop4a_1x1_za64_u16_u16(
// CHECK-NEXT: entry:
-// CHECK-NEXT: tail call void @llvm.aarch64.sme.mop4a.wide.1x1.nxv8i16(i32 3, <vscale x 8 x i16> [[ZN:%.*]], <vscale x 8 x i16> [[ZM:%.*]])
+// CHECK-NEXT: tail call void @llvm.aarch64.sme.umop4a.wide.1x1.nxv8i16(i32 3, <vscale x 8 x i16> [[ZN:%.*]], <vscale x 8 x i16> [[ZM:%.*]])
// CHECK-NEXT: ret void
//
// CPP-CHECK-LABEL: @_Z29test_svmop4a_1x1_za64_u16_u16u12__SVUint16_tS_(
// CPP-CHECK-NEXT: entry:
-// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.mop4a.wide.1x1.nxv8i16(i32 3, <vscale x 8 x i16> [[ZN:%.*]], <vscale x 8 x i16> [[ZM:%.*]])
+// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.umop4a.wide.1x1.nxv8i16(i32 3, <vscale x 8 x i16> [[ZN:%.*]], <vscale x 8 x i16> [[ZM:%.*]])
// CPP-CHECK-NEXT: ret void
//
void test_svmop4a_1x1_za64_u16_u16(svuint16_t zn, svuint16_t zm) __arm_streaming __arm_inout("za") {
@@ -228,12 +228,12 @@ void test_svmop4a_1x1_za64_u16_u16(svuint16_t zn, svuint16_t zm) __arm_streaming
// CHECK-LABEL: @test_svmop4s_1x1_za64_u16_u16(
// CHECK-NEXT: entry:
-// CHECK-NEXT: tail call void @llvm.aarch64.sme.mop4s.wide.1x1.nxv8i16(i32 3, <vscale x 8 x i16> [[ZN:%.*]], <vscale x 8 x i16> [[ZM:%.*]])
+// CHECK-NEXT: tail call void @llvm.aarch64.sme.umop4s.wide.1x1.nxv8i16(i32 3, <vscale x 8 x i16> [[ZN:%.*]], <vscale x 8 x i16> [[ZM:%.*]])
// CHECK-NEXT: ret void
//
// CPP-CHECK-LABEL: @_Z29test_svmop4s_1x1_za64_u16_u16u12__SVUint16_tS_(
// CPP-CHECK-NEXT: entry:
-// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.mop4s.wide.1x1.nxv8i16(i32 3, <vscale x 8 x i16> [[ZN:%.*]], <vscale x 8 x i16> [[ZM:%.*]])
+// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.umop4s.wide.1x1.nxv8i16(i32 3, <vscale x 8 x i16> [[ZN:%.*]], <vscale x 8 x i16> [[ZM:%.*]])
// CPP-CHECK-NEXT: ret void
//
void test_svmop4s_1x1_za64_u16_u16(svuint16_t zn, svuint16_t zm) __arm_streaming __arm_inout("za") {
@@ -242,12 +242,12 @@ void test_svmop4s_1x1_za64_u16_u16(svuint16_t zn, svuint16_t zm) __arm_streaming
// CHECK-LABEL: @test_svmop4a_1x1_za64_s16_u16(
// CHECK-NEXT: entry:
-// CHECK-NEXT: tail call void @llvm.aarch64.sme.mop4a.wide.1x1.nxv8i16(i32 3, <vscale x 8 x i16> [[ZN:%.*]], <vscale x 8 x i16> [[ZM:%.*]])
+// CHECK-NEXT: tail call void @llvm.aarch64.sme.sumop4a.wide.1x1.nxv8i16(i32 3, <vscale x 8 x i16> [[ZN:%.*]], <vscale x 8 x i16> [[ZM:%.*]])
// CHECK-NEXT: ret void
//
// CPP-CHECK-LABEL: @_Z29test_svmop4a_1x1_za64_s16_u16u11__SVInt16_tu12__SVUint16_t(
// CPP-CHECK-NEXT: entry:
-// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.mop4a.wide.1x1.nxv8i16(i32 3, <vscale x 8 x i16> [[ZN:%.*]], <vscale x 8 x i16> [[ZM:%.*]])
+// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.sumop4a.wide.1x1.nxv8i16(i32 3, <vscale x 8 x i16> [[ZN:%.*]], <vscale x 8 x i16> [[ZM:%.*]])
// CPP-CHECK-NEXT: ret void
//
void test_svmop4a_1x1_za64_s16_u16(svint16_t zn, svuint16_t zm) __arm_streaming __arm_inout("za") {
@@ -256,12 +256,12 @@ void test_svmop4a_1x1_za64_s16_u16(svint16_t zn, svuint16_t zm) __arm_streaming
// CHECK-LABEL: @test_svmop4s_1x1_za64_s16_u16(
// CHECK-NEXT: entry:
-// CHECK-NEXT: tail call void @llvm.aarch64.sme.mop4s.wide.1x1.nxv8i16(i32 3, <vscale x 8 x i16> [[ZN:%.*]], <vscale x 8 x i16> [[ZM:%.*]])
+// CHECK-NEXT: tail call void @llvm.aarch64.sme.sumop4s.wide.1x1.nxv8i16(i32 3, <vscale x 8 x i16> [[ZN:%.*]], <vscale x 8 x i16> [[ZM:%.*]])
// CHECK-NEXT: ret void
//
// CPP-CHECK-LABEL: @_Z29test_svmop4s_1x1_za64_s16_u16u11__SVInt16_tu12__SVUint16_t(
// CPP-CHECK-NEXT: entry:
-// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.mop4s.wide.1x1.nxv8i16(i32 3, <vscale x 8 x i16> [[ZN:%.*]], <vscale x 8 x i16> [[ZM:%.*]])
+// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.sumop4s.wide.1x1.nxv8i16(i32 3, <vscale x 8 x i16> [[ZN:%.*]], <vscale x 8 x i16> [[ZM:%.*]])
// CPP-CHECK-NEXT: ret void
//
void test_svmop4s_1x1_za64_s16_u16(svint16_t zn, svuint16_t zm) __arm_streaming __arm_inout("za") {
@@ -270,12 +270,12 @@ void test_svmop4s_1x1_za64_s16_u16(svint16_t zn, svuint16_t zm) __arm_streaming
// CHECK-LABEL: @test_svmop4a_1x1_za64_u16_s16(
// CHECK-NEXT: entry:
-// CHECK-NEXT: tail call void @llvm.aarch64.sme.mop4a.wide.1x1.nxv8i16(i32 3, <vscale x 8 x i16> [[ZN:%.*]], <vscale x 8 x i16> [[ZM:%.*]])
+// CHECK-NEXT: tail call void @llvm.aarch64.sme.usmop4a.wide.1x1.nxv8i16(i32 3, <vscale x 8 x i16> [[ZN:%.*]], <vscale x 8 x i16> [[ZM:%.*]])
// CHECK-NEXT: ret void
//
// CPP-CHECK-LABEL: @_Z29test_svmop4a_1x1_za64_u16_s16u12__SVUint16_tu11__SVInt16_t(
// CPP-CHECK-NEXT: entry:
-// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.mop4a.wide.1x1.nxv8i16(i32 3, <vscale x 8 x i16> [[ZN:%.*]], <vscale x 8 x i16> [[ZM:%.*]])
+// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.usmop4a.wide.1x1.nxv8i16(i32 3, <vscale x 8 x i16> [[ZN:%.*]], <vscale x 8 x i16> [[ZM:%.*]])
// CPP-CHECK-NEXT: ret void
//
void test_svmop4a_1x1_za64_u16_s16(svuint16_t zn, svint16_t zm) __arm_streaming __arm_inout("za") {
@@ -284,12 +284,12 @@ void test_svmop4a_1x1_za64_u16_s16(svuint16_t zn, svint16_t zm) __arm_streaming
// CHECK-LABEL: @test_svmop4s_1x1_za64_u16_s16(
// CHECK-NEXT: entry:
-// CHECK-NEXT: tail call void @llvm.aarch64.sme.mop4s.wide.1x1.nxv8i16(i32 3, <vscale x 8 x i16> [[ZN:%.*]], <vscale x 8 x i16> [[ZM:%.*]])
+// CHECK-NEXT: tail call void @llvm.aarch64.sme.usmop4s.wide.1x1.nxv8i16(i32 3, <vscale x 8 x i16> [[ZN:%.*]], <vscale x 8 x i16> [[ZM:%.*]])
// CHECK-NEXT: ret void
//
// CPP-CHECK-LABEL: @_Z29test_svmop4s_1x1_za64_u16_s16u12__SVUint16_tu11__SVInt16_t(
// CPP-CHECK-NEXT: entry:
-// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.mop4s.wide.1x1.nxv8i16(i32 3, <vscale x 8 x i16> [[ZN:%.*]], <vscale x 8 x i16> [[ZM:%.*]])
+// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.usmop4s.wide.1x1.nxv8i16(i32 3, <vscale x 8 x i16> [[ZN:%.*]], <vscale x 8 x i16> [[ZM:%.*]])
// CPP-CHECK-NEXT: ret void
//
void test_svmop4s_1x1_za64_u16_s16(svuint16_t zn, svint16_t zm) __arm_streaming __arm_inout("za") {
@@ -298,12 +298,12 @@ void test_svmop4s_1x1_za64_u16_s16(svuint16_t zn, svint16_t zm) __arm_streaming
// CHECK-LABEL: @test_svmop4a_1x1_za32_s8_u8(
// CHECK-NEXT: entry:
-// CHECK-NEXT: tail call void @llvm.aarch64.sme.mop4a.wide.1x1.nxv16i8(i32 3, <vscale x 16 x i8> [[ZN:%.*]], <vscale x 16 x i8> [[ZM:%.*]])
+// CHECK-NEXT: tail call void @llvm.aarch64.sme.sumop4a.wide.1x1.nxv16i8(i32 3, <vscale x 16 x i8> [[ZN:%.*]], <vscale x 16 x i8> [[ZM:%.*]])
// CHECK-NEXT: ret void
//
// CPP-CHECK-LABEL: @_Z27test_svmop4a_1x1_za32_s8_u8u10__SVInt8_tu11__SVUint8_t(
// CPP-CHECK-NEXT: entry:
-// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.mop4a.wide.1x1.nxv16i8(i32 3, <vscale x 16 x i8> [[ZN:%.*]], <vscale x 16 x i8> [[ZM:%.*]])
+// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.sumop4a.wide.1x1.nxv16i8(i32 3, <vscale x 16 x i8> [[ZN:%.*]], <vscale x 16 x i8> [[ZM:%.*]])
// CPP-CHECK-NEXT: ret void
//
void test_svmop4a_1x1_za32_s8_u8(svint8_t zn, svuint8_t zm) __arm_streaming __arm_inout("za") {
@@ -312,12 +312,12 @@ void test_svmop4a_1x1_za32_s8_u8(svint8_t zn, svuint8_t zm) __arm_streaming __ar
// CHECK-LABEL: @test_svmop4s_1x1_za32_s8_u8(
// CHECK-NEXT: entry:
-// CHECK-NEXT: tail call void @llvm.aarch64.sme.mop4s.wide.1x1.nxv16i8(i32 3, <vscale x 16 x i8> [[ZN:%.*]], <vscale x 16 x i8> [[ZM:%.*]])
+// CHECK-NEXT: tail call void @llvm.aarch64.sme.sumop4s.wide.1x1.nxv16i8(i32 3, <vscale x 16 x i8> [[ZN:%.*]], <vscale x 16 x i8> [[ZM:%.*]])
// CHECK-NEXT: ret void
//
// CPP-CHECK-LABEL: @_Z27test_svmop4s_1x1_za32_s8_u8u10__SVInt8_tu11__SVUint8_t(
// CPP-CHECK-NEXT: entry:
-// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.mop4s.wide.1x1.nxv16i8(i32 3, <vscale x 16 x i8> [[ZN:%.*]], <vscale x 16 x i8> [[ZM:%.*]])
+// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.sumop4s.wide.1x1.nxv16i8(i32 3, <vscale x 16 x i8> [[ZN:%.*]], <vscale x 16 x i8> [[ZM:%.*]])
// CPP-CHECK-NEXT: ret void
//
void test_svmop4s_1x1_za32_s8_u8(svint8_t zn, svuint8_t zm) __arm_streaming __arm_inout("za") {
@@ -326,12 +326,12 @@ void test_svmop4s_1x1_za32_s8_u8(svint8_t zn, svuint8_t zm) __arm_streaming __ar
// CHECK-LABEL: @test_svmop4a_1x1_za32_u8_s8(
// CHECK-NEXT: entry:
-// CHECK-NEXT: tail call void @llvm.aarch64.sme.mop4a.wide.1x1.nxv16i8(i32 3, <vscale x 16 x i8> [[ZN:%.*]], <vscale x 16 x i8> [[ZM:%.*]])
+// CHECK-NEXT: tail call void @llvm.aarch64.sme.usmop4a.wide.1x1.nxv16i8(i32 3, <vscale x 16 x i8> [[ZN:%.*]], <vscale x 16 x i8> [[ZM:%.*]])
// CHECK-NEXT: ret void
//
// CPP-CHECK-LABEL: @_Z27test_svmop4a_1x1_za32_u8_s8u11__SVUint8_tu10__SVInt8_t(
// CPP-CHECK-NEXT: entry:
-// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.mop4a.wide.1x1.nxv16i8(i32 3, <vscale x 16 x i8> [[ZN:%.*]], <vscale x 16 x i8> [[ZM:%.*]])
+// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.usmop4a.wide.1x1.nxv16i8(i32 3, <vscale x 16 x i8> [[ZN:%.*]], <vscale x 16 x i8> [[ZM:%.*]])
// CPP-CHECK-NEXT: ret void
//
void test_svmop4a_1x1_za32_u8_s8(svuint8_t zn, svint8_t zm) __arm_streaming __arm_inout("za") {
@@ -340,12 +340,12 @@ void test_svmop4a_1x1_za32_u8_s8(svuint8_t zn, svint8_t zm) __arm_streaming __ar
// CHECK-LABEL: @test_svmop4s_1x1_za32_u8_s8(
// CHECK-NEXT: entry:
-// CHECK-NEXT: tail call void @llvm.aarch64.sme.mop4s.wide.1x1.nxv16i8(i32 3, <vscale x 16 x i8> [[ZN:%.*]], <vscale x 16 x i8> [[ZM:%.*]])
+// CHECK-NEXT: tail call void @llvm.aarch64.sme.usmop4s.wide.1x1.nxv16i8(i32 3, <vscale x 16 x i8> [[ZN:%.*]], <vscale x 16 x i8> [[ZM:%.*]])
// CHECK-NEXT: ret void
//
// CPP-CHECK-LABEL: @_Z27test_svmop4s_1x1_za32_u8_s8u11__SVUint8_tu10__SVInt8_t(
// CPP-CHECK-NEXT: entry:
-// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.mop4s.wide.1x1.nxv16i8(i32 3, <vscale x 16 x i8> [[ZN:%.*]], <vscale x 16 x i8> [[ZM:%.*]])
+// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.usmop4s.wide.1x1.nxv16i8(i32 3, <vscale x 16 x i8> [[ZN:%.*]], <vscale x 16 x i8> [[ZM:%.*]])
// CPP-CHECK-NEXT: ret void
//
void test_svmop4s_1x1_za32_u8_s8(svuint8_t zn, svint8_t zm) __arm_streaming __arm_inout("za") {
diff --git a/llvm/include/llvm/IR/IntrinsicsAArch64.td b/llvm/include/llvm/IR/IntrinsicsAArch64.td
index 24052d8a45d75..8556ae83c79f9 100644
--- a/llvm/include/llvm/IR/IntrinsicsAArch64.td
+++ b/llvm/include/llvm/IR/IntrinsicsAArch64.td
@@ -3064,16 +3064,32 @@ let TargetPrefix = "aarch64" in {
def int_aarch64_sme_usmopa_wide : SME_OuterProduct_Intrinsic;
def int_aarch64_sme_usmops_wide : SME_OuterProduct_Intrinsic;
- class SME_OuterProduct_QuaterTile
+ class SME_OuterProduct_QuaterTile_Single
: DefaultAttrsIntrinsic<[],
[llvm_i32_ty,
llvm_anyvector_ty,
LLVMMatchType<0>], [ImmArg<ArgIndex<0>>]>;
- def int_aarch64_sme_mop4a_wide_1x1 : SME_OuterProduct_QuaterTile;
- def int_aarch64_sme_mop4s_wide_1x1 : SME_OuterProduct_QuaterTile;
- def int_aarch64_sme_mop4a_1x1 : SME_OuterProduct_QuaterTile;
- def int_aarch64_sme_mop4s_1x1 : SME_OuterProduct_QuaterTile;
+ def int_aarch64_sme_mop4a_wide_1x1 : SME_OuterProduct_QuaterTile_Single;
+ def int_aarch64_sme_mop4s_wide_1x1 : SME_OuterProduct_QuaterTile_Single;
+ def int_aarch64_sme_mop4a_1x1 : SME_OuterProduct_QuaterTile_Single;
+ def int_aarch64_sme_mop4s_1x1 : SME_OuterProduct_QuaterTile_Single;
+ def int_aarch64_sme_smop4a_wide_1x1 : SME_OuterProduct_QuaterTile_Single;
+ def int_aarch64_sme_smop4s_wide_1x1 : SME_OuterProduct_QuaterTile_Single;
+ def int_aarch64_sme_smop4a_1x1 : SME_OuterProduct_QuaterTile_Single;
+ def int_aarch64_sme_smop4s_1x1 : SME_OuterProduct_QuaterTile_Single;
+ def int_aarch64_sme_umop4a_wide_1x1 : SME_OuterProduct_QuaterTile_Single;
+ def int_aarch64_sme_umop4s_wide_1x1 : SME_OuterProduct_QuaterTile_Single;
+ def int_aarch64_sme_umop4a_1x1 : SME_OuterProduct_QuaterTile_Single;
+ def int_aarch64_sme_umop4s_1x1 : SME_OuterProduct_QuaterTile_Single;
+ def int_aarch64_sme_sumop4a_wide_1x1 : SME_OuterProduct_QuaterTile_Single;
+ def int_aarch64_sme_sumop4s_wide_1x1 : SME_OuterProduct_QuaterTile_Single;
+ def int_aarch64_sme_sumop4a_1x1 : SME_OuterProduct_QuaterTile_Single;
+ def int_aarch64_sme_sumop4s_1x1 : SME_OuterProduct_QuaterTile_Single;
+ def int_aarch64_sme_usmop4a_wide_1x1 : SME_OuterProduct_QuaterTile_Single;
+ def int_aarch64_sme_usmop4s_wide_1x1 : SME_OuterProduct_QuaterTile_Single;
+ def int_aarch64_sme_usmop4a_1x1 : SME_OuterProduct_QuaterTile_Single;
+ def int_aarch64_sme_usmop4s_1x1 : SME_OuterProduct_QuaterTile_Single;
class SME_AddVectorToTile_Intrinsic
: DefaultAttrsIntrinsic<[],
diff --git a/llvm/lib/Target/AArch64/AArch64SMEInstrInfo.td b/llvm/lib/Target/AArch64/AArch64SMEInstrInfo.td
index 0673394d4daa9..c02e72e8b4c26 100644
--- a/llvm/lib/Target/AArch64/AArch64SMEInstrInfo.td
+++ b/llvm/lib/Target/AArch64/AArch64SMEInstrInfo.td
@@ -148,30 +148,30 @@ defm USMOPS_MPPZZ_D : sme_int_outer_product_i64<0b101, "usmops", int_aarch64_sme
}
let Predicates = [HasSME_MOP4] in {
- defm SMOP4A : sme_quarter_outer_product_i8_i32<0b0, 0b0, 0b0, "smop4a", int_aarch64_sme_mop4a_wide_1x1>;
- defm SMOP4S : sme_quarter_outer_product_i8_i32<0b0, 0b0, 0b1, "smop4s", int_aarch64_sme_mop4s_wide_1x1>;
- defm SUMOP4A : sme_quarter_outer_product_i8_i32<0b0, 0b1, 0b0, "sumop4a", int_aarch64_sme_mop4a_wide_1x1>;
- defm SUMOP4S : sme_quarter_outer_product_i8_i32<0b0, 0b1, 0b1, "sumop4s", int_aarch64_sme_mop4a_wide_1x1>;
- defm USMOP4A : sme_quarter_outer_product_i8_i32<0b1, 0b0, 0b0, "usmop4a", int_aarch64_sme_mop4a_wide_1x1>;
- defm USMOP4S : sme_quarter_outer_product_i8_i32<0b1, 0b0, 0b1, "usmop4s", int_aarch64_sme_mop4a_wide_1x1>;
- defm UMOP4A : sme_quarter_outer_product_i8_i32<0b1, 0b1, 0b0, "umop4a", int_aarch64_sme_mop4a_wide_1x1>;
- defm UMOP4S : sme_quarter_outer_product_i8_i32<0b1, 0b1, 0b1, "umop4s", int_aarch64_sme_mop4s_wide_1x1>;
-
- defm SMOP4A : sme_quarter_outer_product_i16_i32<0b0, 0b0, "smop4a", int_aarch64_sme_mop4a_wide_1x1>;
- defm SMOP4S : sme_quarter_outer_product_i16_i32<0b0, 0b1, "smop4s", int_aarch64_sme_mop4s_wide_1x1>;
- defm UMOP4A : sme_quarter_outer_product_i16_i32<0b1, 0b0, "umop4a", int_aarch64_sme_mop4a_wide_1x1>;
- defm UMOP4S : sme_quarter_outer_product_i16_i32<0b1, 0b1, "umop4s", int_aarch64_sme_mop4s_wide_1x1>;
+ defm SMOP4A : sme_quarter_outer_product_i8_i32<0b0, 0b0, 0b0, "smop4a", "int_aarch64_sme_smop4a_wide">;
+ defm SMOP4S : sme_quarter_outer_product_i8_i32<0b0, 0b0, 0b1, "smop4s", "int_aarch64_sme_smop4s_wide">;
+ defm SUMOP4A : sme_quarter_outer_product_i8_i32<0b0, 0b1, 0b0, "sumop4a", "int_aarch64_sme_sumop4a_wide">;
+ defm SUMOP4S : sme_quarter_outer_product_i8_i32<0b0, 0b1, 0b1, "sumop4s", "int_aarch64_sme_sumop4s_wide">;
+ defm USMOP4A : sme_quarter_outer_product_i8_i32<0b1, 0b0, 0b0, "usmop4a", "int_aarch64_sme_usmop4a_wide">;
+ defm USMOP4S : sme_quarter_outer_product_i8_i32<0b1, 0b0, 0b1, "usmop4s", "int_aarch64_sme_usmop4s_wide">;
+ defm UMOP4A : sme_quarter_outer_product_i8_i32<0b1, 0b1, 0b0, "umop4a", "int_aarch64_sme_umop4a_wide">;
+ defm UMOP4S : sme_quarter_outer_product_i8_i32<0b1, 0b1, 0b1, "umop4s", "int_aarch64_sme_umop4s_wide">;
+
+ defm SMOP4A : sme_quarter_outer_product_i16_i32<0b0, 0b0, "smop4a", "int_aarch64_sme_smop4a_wide">;
+ defm SMOP4S : sme_quarter_outer_product_i16_i32<0b0, 0b1, "smop4s", "int_aarch64_sme_smop4s_wide">;
+ defm UMOP4A : sme_quarter_outer_product_i16_i32<0b1, 0b0, "umop4a", "int_aarch64_sme_umop4a_wide">;
+ defm UMOP4S : sme_quarter_outer_product_i16_i32<0b1, 0b1, "umop4s", "int_aarch64_sme_umop4s_wide">;
}
let Predicates = [HasSME_MOP4, HasSMEI16I64] in {
- defm SMOP4A : sme_quarter_outer_product_i64<0b0, 0b0, 0b0, "smop4a", int_aarch64_sme_mop4a_wide_1x1>;
- defm SMOP4S : sme_quarter_outer_product_i64<0b0, 0b0, 0b1, "smop4s", int_aarch64_sme_mop4s_wide_1x1>;
- defm SUMOP4A : sme_quarter_outer_product_i64<0b0, 0b1, 0b0, "sumop4a", int_aarch64_sme_mop4a_wide_1x1>;
- defm SUMOP4S : sme_quarter_outer_product_i64<0b0, 0b1, 0b1, "sumop4s", int_aarch64_sme_mop4s_wide_1x1>;
- defm UMOP4A : sme_quarter_outer_product_i64<0b1, 0b1, 0b0, "umop4a", int_aarch64_sme_mop4a_wide_1x1>;
- defm UMOP4S : sme_quarter_outer_product_i64<0b1, 0b1, 0b1, "umop4s", int_aarch64_sme_mop4s_wide_1x1>;
- defm USMOP4A : sme_quarter_outer_product_i64<0b1, 0b0, 0b0, "usmop4a", int_aarch64_sme_mop4a_wide_1x1>;
- defm USMOP4S : sme_quarter_outer_product_i64<0b1, 0b0, 0b1, "usmop4s", int_aarch64_sme_mop4s_wide_1x1>;
+ defm SMOP4A : sme_quarter_outer_product_i64<0b0, 0b0, 0b0, "smop4a", "int_aarch64_sme_smop4a_wide">;
+ defm SMOP4S : sme_quarter_outer_product_i64<0b0, 0b0, 0b1, "smop4s", "int_aarch64_sme_smop4s_wide">;
+ defm SUMOP4A : sme_quarter_outer_product_i64<0b0, 0b1, 0b0, "sumop4a", "int_aarch64_sme_sumop4a_wide">;
+ defm SUMOP4S : sme_quarter_outer_product_i64<0b0, 0b1, 0b1, "sumop4s", "int_aarch64_sme_sumop4s_wide">;
+ defm UMOP4A : sme_quarter_outer_product_i64<0b1, 0b1, 0b0, "umop4a", "int_aarch64_sme_umop4a_wide">;
+ defm UMOP4S : sme_quarter_outer_product_i64<0b1, 0b1, 0b1, "umop4s", "int_aarch64_sme_umop4s_wide">;
+ defm USMOP4A : sme_quarter_outer_product_i64<0b1, 0b0, 0b0, "usmop4a", "int_aarch64_sme_usmop4a_wide">;
+ defm USMOP4S : sme_quarter_outer_product_i64<0b1, 0b0, 0b1, "usmop4s", "int_aarch64_sme_usmop4s_wide">;
}
let Predicates = [HasSME_TMOP] in {
@@ -1054,14 +1054,14 @@ let Predicates = [HasSME2, HasSVEBFSCALE] in {
}
let Predicates = [HasSME_MOP4] in {
- defm BFMOP4A : sme2_bfmop4as_widening<0, "bfmop4a", int_aarch64_sme_mop4a_wide_1x1>;
- defm BFMOP4S : sme2_bfmop4as_widening<1, "bfmop4s", int_aarch64_sme_mop4s_wide_1x1>;
+ defm BFMOP4A : sme2_bfmop4as_widening<0, "bfmop4a", "int_aarch64_sme_mop4a_wide">;
+ defm BFMOP4S : sme2_bfmop4as_widening<1, "bfmop4s", "int_aarch64_sme_mop4s_wide">;
- defm FMOP4A : sme2_fmop4as_fp16_fp32_widening<0, "fmop4a", int_aarch64_sme_mop4a_wide_1x1>;
- defm FMOP4S : sme2_fmop4as_fp16_fp32_widening<1, "fmop4s", int_aarch64_sme_mop4s_wide_1x1>;
+ defm FMOP4A : sme2_fmop4as_fp16_fp32_widening<0, "fmop4a", "int_aarch64_sme_mop4a_wide">;
+ defm FMOP4S : sme2_fmop4as_fp16_fp32_widening<1, "fmop4s", "int_aarch64_sme_mop4s_wide">;
- defm FMOP4A : sme2_fmop4as_fp32_non_widening<0, "fmop4a", int_aarch64_sme_mop4a_1x1>;
- defm FMOP4S : sme2_fmop4as_fp32_non_widening<1, "fmop4s", int_aarch64_sme_mop4s_1x1>;
+ defm FMOP4A : sme2_fmop4as_fp32_non_widening<0, "fmop4a", "int_aarch64_sme_mop4a">;
+ defm FMOP4S : sme2_fmop4as_fp32_non_widening<1, "fmop4s", "int_aarch64_sme_mop4s">;
}
let Predicates = [HasSME_TMOP] in {
@@ -1099,8 +1099,8 @@ let Predicates = [HasSME_TMOP, HasSMEF16F16] in {
}
let Predicates = [HasSME_MOP4, HasSMEF16F16] in {
- defm FMOP4A : sme2_fmop4as_fp16_non_widening<0, "fmop4a", int_aarch64_sme_mop4a_1x1>;
- defm FMOP4S : sme2_fmop4as_fp16_non_widening<1, "fmop4s", int_aarch64_sme_mop4s_1x1>;
+ defm FMOP4A : sme2_fmop4as_fp16_non_widening<0, "fmop4a", "int_aarch64_sme_mop4a">;
+ defm FMOP4S : sme2_fmop4as_fp16_non_widening<1, "fmop4s", "int_aarch64_sme_mop4s">;
}
let Predicates = [HasSME2, HasSVEBFSCALE] in {
@@ -1115,11 +1115,11 @@ let Predicates = [HasSME_MOP4, HasSMEF8F32] in {
}
let Predicates = [HasSME_MOP4, HasSMEB16B16] in {
- defm BFMOP4A : sme2_bfmop4as_non_widening<0, "bfmop4a", int_aarch64_sme_mop4a_1x1>;
- defm BFMOP4S : sme2_bfmop4as_non_widening<1, "bfmop4s", int_aarch64_sme_mop4s_1x1>;
+ defm BFMOP4A : sme2_bfmop4as_non_widening<0, "bfmop4a", "int_aarch64_sme_mop4a">;
+ defm BFMOP4S : sme2_bfmop4as_non_widening<1, "bfmop4s", "int_aarch64_sme_mop4s">;
}
let Predicates = [HasSME_MOP4, HasSMEF64F64] in {
- defm FMOP4A : sme2_fmop4as_fp64_non_widening<0, "fmop4a", int_aarch64_sme_mop4a_1x1>;
- defm FMOP4S : sme2_fmop4as_fp64_non_widening<1, "fmop4s", int_aarch64_sme_mop4s_1x1>;
+ defm FMOP4A : sme2_fmop4as_fp64_non_widening<0, "fmop4a", "int_aarch64_sme_mop4a">;
+ defm FMOP4S : sme2_fmop4as_fp64_non_widening<1, "fmop4s", "int_aarch64_sme_mop4s">;
}
diff --git a/llvm/lib/Target/AArch64/SMEInstrFormats.td b/llvm/lib/Target/AArch64/SMEInstrFormats.td
index 755531505636d..9840d36b2c0fc 100644
--- a/llvm/lib/Target/AArch64/SMEInstrFormats.td
+++ b/llvm/lib/Target/AArch64/SMEInstrFormats.td
@@ -104,7 +104,7 @@ class sme_outer_product_pseudo<ZPRRegOp zpr_ty, SMEMatrixTypeEnum za_flag>
let usesCustomInserter = 1;
}
-class sme2_quarter_tile_outer_product_pseudo<ZPRRegOp zn_ty, ZPRRegOp zm_ty, SMEMatrixTypeEnum za_flag>
+class sme2_quarter_tile_outer_product_pseudo_single_single<ZPRRegOp zn_ty, ZPRRegOp zm_ty, SMEMatrixTypeEnum za_flag>
: Pseudo<(outs), (ins i32imm:$tile,
zn_ty:$zn, zm_ty:$zm), []>,
Sched<[]> {
@@ -612,13 +612,13 @@ class sme_quarter_outer_product_i16_i32<bit u0, bit N, bit M, bit subtr, Registe
let Constraints = "$ZAda = $_ZAda";
}
-multiclass sme_quarter_outer_product_i8_i32<bit zn_u, bit zm_u, bit subtr, string mnemonic, SDPatternOperator op>{
+multiclass sme_quarter_outer_product_i8_i32<bit zn_u, bit zm_u, bit subtr, string mnemonic, string op>{
def _MZZ_BToS : sme_quarter_outer_product_i8_i32<{zn_u, 0}, {zm_u, 0}, subtr,
ZPR8Mul2_Lo, ZPR8Mul2_Hi, mnemonic>, SMEPseudo2Instr<NAME # _MZZ_BToS, 1>;
- def NAME # _MZZ_BToS # _PSEUDO : sme2_quarter_tile_outer_product_pseudo<ZPR8Mul2_Lo, ZPR8Mul2_Hi, SMEMatrixTileS>, SMEPseudo2Instr<NAME # _MZZ_BToS, 0>;
+ def NAME # _MZZ_BToS # _PSEUDO : sme2_quarter_tile_outer_product_pseudo_single_single<ZPR8Mul2_Lo, ZPR8Mul2_Hi, SMEMatrixTileS>, SMEPseudo2Instr<NAME # _MZZ_BToS, 0>;
- def : SME2_ZA_Tile_TwoVec_Pat<NAME # _MZZ_BToS, op, timm32_0_3, nxv16i8>;
+ def : SME2_ZA_Tile_TwoVec_Pat<NAME # _MZZ_BToS, !cast<SDPatternOperator>(op # "_1x1"), timm32_0_3, nxv16i8>;
def _M2ZZ_BToS : sme_quarter_outer_product_i8_i32<{zn_u, 1}, {zm_u, 0}, subtr,
ZZ_b_mul_r_Lo, ZPR8Mul2_Hi, mnemonic>;
@@ -628,13 +628,13 @@ multiclass sme_quarter_outer_product_i8_i32<bit zn_u, bit zm_u, bit subtr, strin
ZZ_b_mul_r_Lo, ZZ_b_mul_r_Hi, mnemonic>;
}
-multiclass sme_quarter_outer_product_i16_i32<bit unsigned, bit subtr, string mnemonic, SDPatternOperator op>{
+multiclass sme_quarter_outer_product_i16_i32<bit unsigned, bit subtr, string mnemonic, string op>{
def _MZZ_HToS : sme_quarter_outer_product_i16_i32<unsigned, 0b0, 0b0, subtr,
ZPR16Mul2_Lo, ZPR16Mul2_Hi, mnemonic>, SMEPseudo2Instr<NAME # _MZZ_HToS, 1>;
- def NAME # _MZZ_HToS # _PSEUDO : sme2_quarter_tile_outer_product_pseudo<ZPR16Mul2_Lo, ZPR16Mul2_Hi, SMEMatrixTileS>, SMEPseudo2Instr<NAME # _MZZ_HToS, 0>;
+ def NAME # _MZZ_HToS # _PSEUDO : sme2_quarter_tile_outer_product_pseudo_single_single<ZPR16Mul2_Lo, ZPR16Mul2_Hi, SMEMatrixTileS>, SMEPseudo2Instr<NAME # _MZZ_HToS, 0>;
- def : SME2_ZA_Tile_TwoVec_Pat<NAME # _MZZ_HToS, op, timm32_0_3, nxv8i16>;
+ def : SME2_ZA_Tile_TwoVec_Pat<NAME # _MZZ_HToS, !cast<SDPatternOperator>(op # "_1x1"), timm32_0_3, nxv8i16>;
def _M2ZZ_HToS : sme_quarter_outer_product_i16_i32<unsigned, 0b1, 0b0, subtr,
ZZ_h_mul_r_Lo, ZPR16Mul2_Hi, mnemonic>;
@@ -644,13 +644,13 @@ multiclass sme_quarter_outer_product_i16_i32<bit unsigned, bit subtr, string mne
ZZ_h_mul_r_Lo, ZZ_h_mul_r_Hi, mnemonic>;
}
-multiclass sme_quarter_outer_product_i64<bit zn_u, bit zm_u, bit subtr, string mnemonic, SDPatternOperator op>{
+multiclass sme_quarter_outer_product_i64<bit zn_u, bit zm_u, bit subtr, string mnemonic, string op>{
def _MZZ_HtoD : sme_quarter_outer_product_i64<{zn_u, 0}, {zm_u, 0}, subtr,
ZPR16Mul2_Lo, ZPR16Mul2_Hi, mnemonic>, SMEPseudo2Instr<NAME # _MZZ_HtoD, 1>;
- def NAME # _MZZ_HtoD # _PSEUDO : sme2_quarter_tile_outer_product_pseudo<ZPR16Mul2_Lo, ZPR16Mul2_Hi, SMEMatrixTileD>, SMEPseudo2Instr<NAME # _MZZ_HtoD, 0>;
+ def NAME # _MZZ_HtoD # _PSEUDO : sme2_quarter_tile_outer_product_pseudo_single_single<ZPR16Mul2_Lo, ZPR16Mul2_Hi, SMEMatrixTileD>, SMEPseudo2Instr<NAME # _MZZ_HtoD, 0>;
- def : SME2_ZA_Tile_TwoVec_Pat<NAME # _MZZ_HtoD, op, timm32_0_3, nxv8i16>;
+ def : SME2_ZA_Tile_TwoVec_Pat<NAME # _MZZ_HtoD, !cast<SDPatternOperator>(op # "_1x1"), timm32_0_3, nxv8i16>;
def _M2ZZ_HtoD : sme_quarter_outer_product_i64<{zn_u, 1}, {zm_u, 0}, subtr,
ZZ_h_mul_r_Lo, ZPR16Mul2_Hi, mnemonic>;
@@ -5497,13 +5497,13 @@ class sme2_bf16_fp32_quarter_tile_outer_product<bit M, bit N, bit S, string mnem
let Constraints = "$ZAda = $_ZAda";
}
-multiclass sme2_bfmop4as_widening<bit S, string mnemonic, SDPatternOperator op> {
+multiclass sme2_bfmop4as_widening<bit S, string mnemonic, string op> {
// Single vectors
def _MZZ_S : sme2_bf16_fp32_quarter_tile_outer_product<0, 0, S, mnemonic, ZPR16Mul2_Lo, ZPR16Mul2_Hi>, SMEPseudo2Instr<NAME # _MZZ_S, 1>;
- def NAME # _MZZ_S # _PSEUDO : sme2_quarter_tile_outer_product_pseudo<ZPR16Mul2_Lo, ZPR16Mul2_Hi, SMEMatrixTileS>, SMEPseudo2Instr<NAME # _MZZ_S, 0>;
+ def NAME # _MZZ_S # _PSEUDO : sme2_quarter_tile_outer_product_pseudo_single_single<ZPR16Mul2_Lo, ZPR16Mul2_Hi, SMEMatrixTileS>, SMEPseudo2Instr<NAME # _MZZ_S, 0>;
- def : SME2_ZA_Tile_TwoVec_Pat<NAME # _MZZ_S, op, timm32_0_3, nxv8bf16>;
+ def : SME2_ZA_Tile_TwoVec_Pat<NAME # _MZZ_S, !cast<SDPatternOperator>(op # "_1x1"), timm32_0_3, nxv8bf16>;
// Multiple and single vectors
def _M2ZZ_S : sme2_bf16_fp32_quarter_tile_outer_product<0, 1, S, mnemonic, ZZ_h_mul_r_Lo, ZPR16Mul2_Hi>;
@@ -5648,13 +5648,13 @@ class sme2_fp16_quarter_tile_outer_product<bit M, bit N, bit S, string mnemonic,
let Constraints = "$ZAda = $_ZAda";
}
-multiclass sme2_fmop4as_fp16_non_widening<bit S, string mnemonic, SDPatternOperator op> {
+multiclass sme2_fmop4as_fp16_non_widening<bit S, string mnemonic, string op> {
// Single vectors
def _MZZ_H : sme2_fp16_quarter_tile_outer_product<0, 0, S, mnemonic, ZPR16Mul2_Lo, ZPR16Mul2_Hi>, SMEPseudo2Instr<NAME # _MZZ_H, 1>;
- def NAME # _MZZ_H # _PSEUDO : sme2_quarter_tile_outer_product_pseudo<ZPR16Mul2_Lo, ZPR16Mul2_Hi, SMEMatrixTileH>, SMEPseudo2Instr<NAME # _MZZ_H, 0>;
+ def NAME # _MZZ_H # _PSEUDO : sme2_quarter_tile_outer_product_pseudo_single_single<ZPR16Mul2_Lo, ZPR16Mul2_Hi, SMEMatrixTileH>, SMEPseudo2Instr<NAME # _MZZ_H, 0>;
- def : SME2_ZA_Tile_TwoVec_Pat<NAME # _MZZ_H, op, timm32_0_3, nxv8f16>;
+ def : SME2_ZA_Tile_TwoVec_Pat<NAME # _MZZ_H, !cast<SDPatternOperator>(op # "_1x1"), timm32_0_3, nxv8f16>;
// Multiple and single vectors
def _M2ZZ_H : sme2_fp16_quarter_tile_outer_product<0, 1, S, mnemonic, ZZ_h_mul_r_Lo, ZPR16Mul2_Hi>;
@@ -5724,13 +5724,13 @@ class sme2_bf16_fp16_quarter_tile_outer_product<bit M, bit N, bit S, string mnem
let Constraints = "$ZAda = $_ZAda";
}
-multiclass sme2_bfmop4as_non_widening<bit S, string mnemonic, SDPatternOperator op> {
+multiclass sme2_bfmop4as_non_widening<bit S, string mnemonic, string op> {
// Single vectors
def _MZZ_H : sme2_bf16_fp16_quarter_tile_outer_product<0, 0, S, mnemonic, ZPR16Mul2_Lo, ZPR16Mul2_Hi>, SMEPseudo2Instr<NAME # _MZZ_H, 1>;
- def NAME # _MZZ_H # _PSEUDO : sme2_quarter_tile_outer_product_pseudo<ZPR16Mul2_Lo, ZPR16Mul2_Hi, SMEMatrixTileH>, SMEPseudo2Instr<NAME # _MZZ_H, 0>;
+ def NAME # _MZZ_H # _PSEUDO : sme2_quarter_tile_outer_product_pseudo_single_single<ZPR16Mul2_Lo, ZPR16Mul2_Hi, SMEMatrixTileH>, SMEPseudo2Instr<NAME # _MZZ_H, 0>;
- def : SME2_ZA_Tile_TwoVec_Pat<NAME # _MZZ_H, op, timm32_0_3, nxv8bf16>;
+ def : SME2_ZA_Tile_TwoVec_Pat<NAME # _MZZ_H, !cast<SDPatternOperator>(op # "_1x1"), timm32_0_3, nxv8bf16>;
// Multiple and single vectors
def _M2ZZ_H : sme2_bf16_fp16_quarter_tile_outer_product<0, 1, S, mnemonic, ZZ_h_mul_r_Lo, ZPR16Mul2_Hi>;
@@ -5765,13 +5765,13 @@ class sme2_fp32_quarter_tile_outer_product<bit M, bit N, bit S, string mnemonic,
let Constraints = "$ZAda = $_ZAda";
}
-multiclass sme2_fmop4as_fp32_non_widening<bit S, string mnemonic, SDPatternOperator op> {
+multiclass sme2_fmop4as_fp32_non_widening<bit S, string mnemonic, string op> {
// Single vectors
def _MZZ_S : sme2_fp32_quarter_tile_outer_product<0, 0, S, mnemonic, ZPR32Mul2_Lo, ZPR32Mul2_Hi>, SMEPseudo2Instr<NAME # _MZZ_S, 1>;
- def NAME # _MZZ_S # _PSEUDO : sme2_quarter_tile_outer_product_pseudo<ZPR16Mul2_Lo, ZPR16Mul2_Hi, SMEMatrixTileS>, SMEPseudo2Instr<NAME # _MZZ_S, 0>;
+ def NAME # _MZZ_S # _PSEUDO : sme2_quarter_tile_outer_product_pseudo_single_single<ZPR16Mul2_Lo, ZPR16Mul2_Hi, SMEMatrixTileS>, SMEPseudo2Instr<NAME # _MZZ_S, 0>;
- def : SME2_ZA_Tile_TwoVec_Pat<NAME # _MZZ_S, op, timm32_0_3, nxv4f32>;
+ def : SME2_ZA_Tile_TwoVec_Pat<NAME # _MZZ_S, !cast<SDPatternOperator>(op # "_1x1"), timm32_0_3, nxv4f32>;
// Multiple and single vectors
def _M2ZZ_S : sme2_fp32_quarter_tile_outer_product<0, 1, S, mnemonic, ZZ_s_mul_r_Lo, ZPR32Mul2_Hi>;
@@ -5806,13 +5806,13 @@ class sme2_fp64_quarter_tile_outer_product<bit M, bit N, bit S, string mnemonic,
let Constraints = "$ZAda = $_ZAda";
}
-multiclass sme2_fmop4as_fp64_non_widening<bit S, string mnemonic, SDPatternOperator op> {
+multiclass sme2_fmop4as_fp64_non_widening<bit S, string mnemonic, string op> {
// Single vectors
def _MZZ_D : sme2_fp64_quarter_tile_outer_product<0, 0, S, mnemonic, ZPR64Mul2_Lo, ZPR64Mul2_Hi>, SMEPseudo2Instr<NAME # _MZZ_D, 1>;
- def NAME # _MZZ_D # _PSEUDO : sme2_quarter_tile_outer_product_pseudo<ZPR64Mul2_Lo, ZPR64Mul2_Hi, SMEMatrixTileD>, SMEPseudo2Instr<NAME # _MZZ_D, 0>;
+ def NAME # _MZZ_D # _PSEUDO : sme2_quarter_tile_outer_product_pseudo_single_single<ZPR64Mul2_Lo, ZPR64Mul2_Hi, SMEMatrixTileD>, SMEPseudo2Instr<NAME # _MZZ_D, 0>;
- def : SME2_ZA_Tile_TwoVec_Pat<NAME # _MZZ_D, op, timm32_0_3, nxv2f64>;
+ def : SME2_ZA_Tile_TwoVec_Pat<NAME # _MZZ_D, !cast<SDPatternOperator>(op # "_1x1"), timm32_0_3, nxv2f64>;
// Multiple and single vectors
def _M2ZZ_D : sme2_fp64_quarter_tile_outer_product<0, 1, S, mnemonic, ZZ_d_mul_r_Lo, ZPR64Mul2_Hi>;
@@ -5847,13 +5847,13 @@ class sme2_fp16_fp32_quarter_tile_outer_product<bit M, bit N, bit S, string mnem
let Constraints = "$ZAda = $_ZAda";
}
-multiclass sme2_fmop4as_fp16_fp32_widening<bit S, string mnemonic, SDPatternOperator op> {
+multiclass sme2_fmop4as_fp16_fp32_widening<bit S, string mnemonic, string op> {
// Single vectors
def _MZZ_HtoS : sme2_fp16_fp32_quarter_tile_outer_product<0, 0, S, mnemonic, ZPR16Mul2_Lo, ZPR16Mul2_Hi>, SMEPseudo2Instr<NAME # _MZZ_HtoS, 1>;
- def NAME # _MZZ_HtoS # _PSEUDO : sme2_quarter_tile_outer_product_pseudo<ZPR16Mul2_Lo, ZPR16Mul2_Hi, SMEMatrixTileS>, SMEPseudo2Instr<NAME # _MZZ_HtoS, 0>;
+ def NAME # _MZZ_HtoS # _PSEUDO : sme2_quarter_tile_outer_product_pseudo_single_single<ZPR16Mul2_Lo, ZPR16Mul2_Hi, SMEMatrixTileS>, SMEPseudo2Instr<NAME # _MZZ_HtoS, 0>;
- def : SME2_ZA_Tile_TwoVec_Pat<NAME # _MZZ_HtoS, op, timm32_0_3, nxv8f16>;
+ def : SME2_ZA_Tile_TwoVec_Pat<NAME # _MZZ_HtoS, !cast<SDPatternOperator>(op # "_1x1"), timm32_0_3, nxv8f16>;
// Multiple and single vectors
def _M2ZZ_HtoS : sme2_fp16_fp32_quarter_tile_outer_product<0, 1, S, mnemonic, ZZ_h_mul_r_Lo, ZPR16Mul2_Hi>;
diff --git a/llvm/test/CodeGen/AArch64/sme2-intrinsics-mop4a_1x1.ll b/llvm/test/CodeGen/AArch64/sme2-intrinsics-mop4a_1x1.ll
index df985675f3070..730bdb2d720d2 100644
--- a/llvm/test/CodeGen/AArch64/sme2-intrinsics-mop4a_1x1.ll
+++ b/llvm/test/CodeGen/AArch64/sme2-intrinsics-mop4a_1x1.ll
@@ -10,7 +10,7 @@ define void @mop4a_za32_s8(<vscale x 16 x i8> %zn, <vscale x 16 x i8> %zm) #0 {
; CHECK-NEXT: mov z24.d, z1.d
; CHECK-NEXT: smop4a za1.s, z0.b, z24.b
; CHECK-NEXT: ret
- call void @llvm.aarch64.sme.mop4a.wide.1x1.nxv16i8(i32 1, <vscale x 16 x i8> %zn, <vscale x 16 x i8> %zm)
+ call void @llvm.aarch64.sme.smop4a.wide.1x1.nxv16i8(i32 1, <vscale x 16 x i8> %zn, <vscale x 16 x i8> %zm)
ret void
}
@@ -20,7 +20,7 @@ define void @mop4s_za32_s8(<vscale x 16 x i8> %zn, <vscale x 16 x i8> %zm) #0 {
; CHECK-NEXT: mov z24.d, z1.d
; CHECK-NEXT: smop4s za1.s, z0.b, z24.b
; CHECK-NEXT: ret
- call void @llvm.aarch64.sme.mop4s.wide.1x1.nxv16i8(i32 1, <vscale x 16 x i8> %zn, <vscale x 16 x i8> %zm)
+ call void @llvm.aarch64.sme.smop4s.wide.1x1.nxv16i8(i32 1, <vscale x 16 x i8> %zn, <vscale x 16 x i8> %zm)
ret void
}
@@ -28,9 +28,9 @@ define void @mop4a_za32_u8(<vscale x 16 x i8> %zn, <vscale x 16 x i8> %zm) #0 {
; CHECK-LABEL: mop4a_za32_u8:
; CHECK: // %bb.0:
; CHECK-NEXT: mov z24.d, z1.d
-; CHECK-NEXT: smop4a za1.s, z0.b, z24.b
+; CHECK-NEXT: umop4a za1.s, z0.b, z24.b
; CHECK-NEXT: ret
- call void @llvm.aarch64.sme.mop4a.wide.1x1.nxv16i8(i32 1, <vscale x 16 x i8> %zn, <vscale x 16 x i8> %zm)
+ call void @llvm.aarch64.sme.umop4a.wide.1x1.nxv16i8(i32 1, <vscale x 16 x i8> %zn, <vscale x 16 x i8> %zm)
ret void
}
@@ -38,9 +38,9 @@ define void @mop4s_za32_u8(<vscale x 16 x i8> %zn, <vscale x 16 x i8> %zm) #0 {
; CHECK-LABEL: mop4s_za32_u8:
; CHECK: // %bb.0:
; CHECK-NEXT: mov z24.d, z1.d
-; CHECK-NEXT: smop4s za1.s, z0.b, z24.b
+; CHECK-NEXT: umop4s za1.s, z0.b, z24.b
; CHECK-NEXT: ret
- call void @llvm.aarch64.sme.mop4s.wide.1x1.nxv16i8(i32 1, <vscale x 16 x i8> %zn, <vscale x 16 x i8> %zm)
+ call void @llvm.aarch64.sme.umop4s.wide.1x1.nxv16i8(i32 1, <vscale x 16 x i8> %zn, <vscale x 16 x i8> %zm)
ret void
}
@@ -50,7 +50,7 @@ define void @mop4a_za32_s16(<vscale x 8 x i16> %zn, <vscale x 8 x i16> %zm) #0 {
; CHECK-NEXT: mov z24.d, z1.d
; CHECK-NEXT: smop4a za1.s, z0.h, z24.h
; CHECK-NEXT: ret
- call void @llvm.aarch64.sme.mop4a.wide.1x1.nxv8i16(i32 1, <vscale x 8 x i16> %zn, <vscale x 8 x i16> %zm)
+ call void @llvm.aarch64.sme.smop4a.wide.1x1.nxv8i16(i32 1, <vscale x 8 x i16> %zn, <vscale x 8 x i16> %zm)
ret void
}
@@ -60,7 +60,7 @@ define void @mop4s_za32_s16(<vscale x 8 x i16> %zn, <vscale x 8 x i16> %zm) #0 {
; CHECK-NEXT: mov z24.d, z1.d
; CHECK-NEXT: smop4s za1.s, z0.h, z24.h
; CHECK-NEXT: ret
- call void @llvm.aarch64.sme.mop4s.wide.1x1.nxv8i16(i32 1, <vscale x 8 x i16> %zn, <vscale x 8 x i16> %zm)
+ call void @llvm.aarch64.sme.smop4s.wide.1x1.nxv8i16(i32 1, <vscale x 8 x i16> %zn, <vscale x 8 x i16> %zm)
ret void
}
@@ -68,9 +68,9 @@ define void @mop4a_za32_u16(<vscale x 8 x i16> %zn, <vscale x 8 x i16> %zm) #0 {
; CHECK-LABEL: mop4a_za32_u16:
; CHECK: // %bb.0:
; CHECK-NEXT: mov z24.d, z1.d
-; CHECK-NEXT: smop4a za1.s, z0.h, z24.h
+; CHECK-NEXT: umop4a za1.s, z0.h, z24.h
; CHECK-NEXT: ret
- call void @llvm.aarch64.sme.mop4a.wide.1x1.nxv8i16(i32 1, <vscale x 8 x i16> %zn, <vscale x 8 x i16> %zm)
+ call void @llvm.aarch64.sme.umop4a.wide.1x1.nxv8i16(i32 1, <vscale x 8 x i16> %zn, <vscale x 8 x i16> %zm)
ret void
}
@@ -78,9 +78,9 @@ define void @mop4s_za32_u16(<vscale x 8 x i16> %zn, <vscale x 8 x i16> %zm) #0 {
; CHECK-LABEL: mop4s_za32_u16:
; CHECK: // %bb.0:
; CHECK-NEXT: mov z24.d, z1.d
-; CHECK-NEXT: smop4s za1.s, z0.h, z24.h
+; CHECK-NEXT: umop4s za1.s, z0.h, z24.h
; CHECK-NEXT: ret
- call void @llvm.aarch64.sme.mop4s.wide.1x1.nxv8i16(i32 1, <vscale x 8 x i16> %zn, <vscale x 8 x i16> %zm)
+ call void @llvm.aarch64.sme.umop4s.wide.1x1.nxv8i16(i32 1, <vscale x 8 x i16> %zn, <vscale x 8 x i16> %zm)
ret void
}
@@ -130,7 +130,7 @@ define void @mop4a_za64_s16(<vscale x 8 x i16> %zn, <vscale x 8 x i16> %zm) #0 {
; CHECK-NEXT: mov z24.d, z1.d
; CHECK-NEXT: smop4a za1.s, z0.h, z24.h
; CHECK-NEXT: ret
- call void @llvm.aarch64.sme.mop4a.wide.1x1.nxv8i16(i32 1, <vscale x 8 x i16> %zn, <vscale x 8 x i16> %zm)
+ call void @llvm.aarch64.sme.smop4a.wide.1x1.nxv8i16(i32 1, <vscale x 8 x i16> %zn, <vscale x 8 x i16> %zm)
ret void
}
@@ -140,7 +140,7 @@ define void @mop4s_za64_s16(<vscale x 8 x i16> %zn, <vscale x 8 x i16> %zm) #0 {
; CHECK-NEXT: mov z24.d, z1.d
; CHECK-NEXT: smop4s za1.s, z0.h, z24.h
; CHECK-NEXT: ret
- call void @llvm.aarch64.sme.mop4s.wide.1x1.nxv8i16(i32 1, <vscale x 8 x i16> %zn, <vscale x 8 x i16> %zm)
+ call void @llvm.aarch64.sme.smop4s.wide.1x1.nxv8i16(i32 1, <vscale x 8 x i16> %zn, <vscale x 8 x i16> %zm)
ret void
}
@@ -148,9 +148,9 @@ define void @mop4a_za64_u16(<vscale x 8 x i16> %zn, <vscale x 8 x i16> %zm) #0 {
; CHECK-LABEL: mop4a_za64_u16:
; CHECK: // %bb.0:
; CHECK-NEXT: mov z24.d, z1.d
-; CHECK-NEXT: smop4a za1.s, z0.h, z24.h
+; CHECK-NEXT: umop4a za1.s, z0.h, z24.h
; CHECK-NEXT: ret
- call void @llvm.aarch64.sme.mop4a.wide.1x1.nxv8i16(i32 1, <vscale x 8 x i16> %zn, <vscale x 8 x i16> %zm)
+ call void @llvm.aarch64.sme.umop4a.wide.1x1.nxv8i16(i32 1, <vscale x 8 x i16> %zn, <vscale x 8 x i16> %zm)
ret void
}
@@ -158,9 +158,9 @@ define void @mop4s_za64_u16(<vscale x 8 x i16> %zn, <vscale x 8 x i16> %zm) #0 {
; CHECK-LABEL: mop4s_za64_u16:
; CHECK: // %bb.0:
; CHECK-NEXT: mov z24.d, z1.d
-; CHECK-NEXT: smop4s za1.s, z0.h, z24.h
+; CHECK-NEXT: umop4s za1.s, z0.h, z24.h
; CHECK-NEXT: ret
- call void @llvm.aarch64.sme.mop4s.wide.1x1.nxv8i16(i32 1, <vscale x 8 x i16> %zn, <vscale x 8 x i16> %zm)
+ call void @llvm.aarch64.sme.umop4s.wide.1x1.nxv8i16(i32 1, <vscale x 8 x i16> %zn, <vscale x 8 x i16> %zm)
ret void
}
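
For reference, a minimal C sketch of the renamed signed/unsigned za32 forms exercised by the tests above (intrinsic spellings are assumed from the ACLE tests in this series; requires a streaming SME2 target, so this is illustrative rather than part of the patch):

#include <arm_sme.h>

// Signed 8-bit quarter-tile outer product accumulating into ZA tile 1;
// expected to lower to "smop4a za1.s, ..." as in the tests above.
void acc_s8(svint8_t zn, svint8_t zm) __arm_streaming __arm_inout("za") {
  svmop4a_1x1_za32_s8_s8(1, zn, zm);
}

// The unsigned variant now selects the distinct umop4a intrinsic/instruction.
void acc_u8(svuint8_t zn, svuint8_t zm) __arm_streaming __arm_inout("za") {
  svmop4a_1x1_za32_u8_u8(1, zn, zm);
}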
From 88e9c0e2be48c6a013dd110ce87b3531986b7203 Mon Sep 17 00:00:00 2001
From: Virginia Cangelosi <virginia.cangelosi at arm.com>
Date: Tue, 25 Feb 2025 13:47:45 +0000
Subject: [PATCH 04/11] Simplify clang multiclasses
---
clang/include/clang/Basic/arm_sme.td | 79 ++++++++++---------
.../AArch64/sme2-intrinsics-mop4a_1x1.ll | 40 ++++++++++
2 files changed, 80 insertions(+), 39 deletions(-)
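
The mixed-sign forms gain dedicated SUMOP4/USMOP4 multiclasses below; a hedged C sketch of the operand pairings they expose (names assumed from the new tests in this patch, not a definitive interface):

#include <arm_sme.h>

void mixed(svint8_t s, svuint8_t u) __arm_streaming __arm_inout("za") {
  svmop4a_1x1_za32_s8_u8(3, s, u);  // signed-by-unsigned -> sumop4a
  svmop4a_1x1_za32_u8_s8(3, u, s);  // unsigned-by-signed -> usmop4a
}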
diff --git a/clang/include/clang/Basic/arm_sme.td b/clang/include/clang/Basic/arm_sme.td
index bf76e4ce53182..123a1c7a007ba 100644
--- a/clang/include/clang/Basic/arm_sme.td
+++ b/clang/include/clang/Basic/arm_sme.td
@@ -376,15 +376,20 @@ let SMETargetGuard = "sme2" in {
// Outer product and accumulate/subtract
//
-multiclass MOP4SingleSingle<string name, string n, string t, string i, string wide> {
- def NAME : Inst<"svmop4" # name # "_1x1_" # n # "[_{d}_{d}]", "vidd", t, MergeNone, i # wide # "_1x1", [IsInOutZA, IsStreaming], [ImmCheck<0, ImmCheck0_3>]>;
+multiclass MOP4<string name, string n, string t, string i, string wide> {
+ def NAME # "_1x1" : Inst<"svmop4" # name # "_1x1_" # n # "[_{d}_{d}]", "vidd", t, MergeNone, i # wide # "_1x1", [IsInOutZA, IsStreaming], [ImmCheck<0, ImmCheck0_3>]>;
}
-multiclass MOP4MixedSignsSingleSingle<string n_suffix1, string n_suffix2, string za, string t> {
- def NAME : SInst<"sv" # n_suffix2 # "_1x1_" # za # "[_{2}_{3}]",
- "vid" # !cond(!eq(n_suffix1, "su") : "u", true: "x"),
- !cond(!eq(n_suffix1, "su") : "", true: "U") # t,
- MergeNone, "aarch64_sme_" # n_suffix1 # n_suffix2 # "_wide_1x1",
+multiclass SUMOP4<string s, string za, string t> {
+ def _1x1 : SInst<"svmop4" # s # "[_1x1_]" # za # "[_{2}_{3}]",
+ "vidu", t, MergeNone, "aarch64_sme_sumop4" # s # "_wide_1x1",
+ [IsStreaming, IsInOutZA],
+ [ImmCheck<0, ImmCheck0_3>]>;
+}
+
+multiclass USMOP4<string s, string za, string t> {
+ def _1x1 : SInst<"svmop4" # s # "[_1x1_]" # za # "[_{2}_{3}]",
+ "vidx", t, MergeNone, "aarch64_sme_usmop4" # s # "_wide_1x1",
[IsStreaming, IsInOutZA],
[ImmCheck<0, ImmCheck0_3>]>;
}
@@ -400,28 +405,24 @@ let SMETargetGuard = "sme2" in {
def SVBMOPS : Inst<"svbmops_za32[_{d}]_m", "viPPdd", "iUi", MergeNone, "aarch64_sme_bmops_za32", [IsInOutZA, IsStreaming], [ImmCheck<0, ImmCheck0_3>]>;
- defm SVSMOP4A_MZZ_HtoS : MOP4SingleSingle<"a", "za32", "s", "aarch64_sme_smop4a", "_wide">;
- defm SVSMOP4S_MZZ_HtoS : MOP4SingleSingle<"s", "za32", "s", "aarch64_sme_smop4s", "_wide">;
- defm SVSMOP4A_MZZ_BToS : MOP4SingleSingle<"a", "za32", "c", "aarch64_sme_smop4a", "_wide">;
- defm SVSMOP4S_MZZ_BToS : MOP4SingleSingle<"s", "za32", "c", "aarch64_sme_smop4s", "_wide">;
+ defm SVSMOP4A_H : MOP4<"a", "za32", "cs", "aarch64_sme_smop4a", "_wide">;
+ defm SVSMOP4S_H : MOP4<"s", "za32", "cs", "aarch64_sme_smop4s", "_wide">;
- defm SVUMOP4A_MZZ_HtoS : MOP4SingleSingle<"a", "za32", "Us", "aarch64_sme_umop4a", "_wide">;
- defm SVUMOP4S_MZZ_HtoS : MOP4SingleSingle<"s", "za32", "Us", "aarch64_sme_umop4s", "_wide">;
- defm SVUMOP4A_MZZ_BToS : MOP4SingleSingle<"a", "za32", "Uc", "aarch64_sme_umop4a", "_wide">;
- defm SVUMOP4S_MZZ_BToS : MOP4SingleSingle<"s", "za32", "Uc", "aarch64_sme_umop4s", "_wide">;
+ defm SVUMOP4A_H : MOP4<"a", "za32", "UcUs", "aarch64_sme_umop4a", "_wide">;
+ defm SVUMOP4S_H : MOP4<"s", "za32", "UcUs", "aarch64_sme_umop4s", "_wide">;
- defm SVFMOP4A_MZZ_HtoS : MOP4SingleSingle<"a", "za32", "h", "aarch64_sme_mop4a", "_wide">;
- defm SVFMOP4S_MZZ_HtoS : MOP4SingleSingle<"s", "za32", "h", "aarch64_sme_mop4s", "_wide">;
- defm SVFMOP4A_MZZ_S : MOP4SingleSingle<"a", "za32", "f", "aarch64_sme_mop4a", "">;
- defm SVFMOP4S_MZZ_S : MOP4SingleSingle<"s", "za32", "f", "aarch64_sme_mop4s", "">;
+ defm SVFMOP4A_HtoS : MOP4<"a", "za32", "h", "aarch64_sme_mop4a", "_wide">;
+ defm SVFMOP4S_HtoS : MOP4<"s", "za32", "h", "aarch64_sme_mop4s", "_wide">;
+ defm SVFMOP4A_S : MOP4<"a", "za32", "f", "aarch64_sme_mop4a", "">;
+ defm SVFMOP4S_S : MOP4<"s", "za32", "f", "aarch64_sme_mop4s", "">;
- defm SVBMOP4A_MZZ_S : MOP4SingleSingle<"a", "za32", "b", "aarch64_sme_mop4a", "_wide">;
- defm SVBMOP4S_MZZ_S : MOP4SingleSingle<"s", "za32", "b", "aarch64_sme_mop4s", "_wide">;
+ defm SVBMOP4A_S : MOP4<"a", "za32", "b", "aarch64_sme_mop4a", "_wide">;
+ defm SVBMOP4S_S : MOP4<"s", "za32", "b", "aarch64_sme_mop4s", "_wide">;
- defm SVSUMOP4A_MZZ_BtoS : MOP4MixedSignsSingleSingle<"su", "mop4a", "za32", "c">;
- defm SVUSMOP4A_MZZ_BtoS : MOP4MixedSignsSingleSingle<"us", "mop4a", "za32", "c">;
- defm SVSUMOP4S_MZZ_BtoS : MOP4MixedSignsSingleSingle<"su", "mop4s", "za32", "c">;
- defm SVUSMOP4S_MZZ_BtoS : MOP4MixedSignsSingleSingle<"us", "mop4s", "za32", "c">;
+ defm SVSUMOP4A_S : SUMOP4<"a", "za32", "cs">;
+ defm SVSUMOP4S_S : SUMOP4<"s", "za32", "cs">;
+ defm SVUSMOP4A_S : USMOP4<"a", "za32", "UcUs">;
+ defm SVUSMOP4S_S : USMOP4<"s", "za32", "UcUs">;
// VERTICAL DOT-PRODUCT
def SVVDOT_LANE_ZA32_VG1x2_S : Inst<"svvdot_lane_za32[_{d}]_vg1x2", "vm2di", "s", MergeNone, "aarch64_sme_svdot_lane_za32_vg1x2", [IsStreaming, IsInOutZA], [ImmCheck<3, ImmCheck0_3>]>;
@@ -473,14 +474,14 @@ let SMETargetGuard = "sme2" in {
}
let SMETargetGuard = "sme2,sme-i16i64" in {
- defm SVSMOP4A_MZZ_HtoD : MOP4SingleSingle<"a", "za64", "s", "aarch64_sme_smop4a", "_wide">;
- defm SVSMOP4S_MZZ_HtoD : MOP4SingleSingle<"s", "za64", "s", "aarch64_sme_smop4s", "_wide">;
- defm SVUMOP4A_MZZ_HtoD : MOP4SingleSingle<"a", "za64", "Us", "aarch64_sme_umop4a", "_wide">;
- defm SVUMOP4S_MZZ_HtoD : MOP4SingleSingle<"s", "za64", "Us", "aarch64_sme_umop4s", "_wide">;
- defm SVSUMOP4A_MZZ_HtoD : MOP4MixedSignsSingleSingle<"su", "mop4a", "za64", "s">;
- defm SVUSMOP4A_MZZ_HtoD : MOP4MixedSignsSingleSingle<"us", "mop4a", "za64", "s">;
- defm SVSUMOP4S_MZZ_HtoD : MOP4MixedSignsSingleSingle<"su", "mop4s", "za64", "s">;
- defm SVUSMOP4S_MZZ_HtoD : MOP4MixedSignsSingleSingle<"us", "mop4s", "za64", "s">;
+ defm SVSMOP4A_HtoD : MOP4<"a", "za64", "s", "aarch64_sme_smop4a", "_wide">;
+ defm SVSMOP4S_HtoD : MOP4<"s", "za64", "s", "aarch64_sme_smop4s", "_wide">;
+ defm SVUMOP4A_HtoD : MOP4<"a", "za64", "Us", "aarch64_sme_umop4a", "_wide">;
+ defm SVUMOP4S_HtoD : MOP4<"s", "za64", "Us", "aarch64_sme_umop4s", "_wide">;
+ defm SVSUMOP4A_D : SUMOP4<"a", "za64", "s">;
+ defm SVSUMOP4S_D : SUMOP4<"s", "za64", "s">;
+ defm SVUSMOP4A_D : USMOP4<"a", "za64", "Us">;
+ defm SVUSMOP4S_D : USMOP4<"s", "za64", "Us">;
def SVVDOT_LANE_ZA64_VG1x4_S : Inst<"svvdot_lane_za64[_{d}]_vg1x4", "vm4di", "s", MergeNone, "aarch64_sme_svdot_lane_za64_vg1x4", [IsStreaming, IsInOutZA], [ImmCheck<3, ImmCheck0_1>]>;
def SVVDOT_LANE_ZA64_VG1x4_U : Inst<"svvdot_lane_za64[_{d}]_vg1x4", "vm4di", "Us", MergeNone, "aarch64_sme_uvdot_lane_za64_vg1x4", [IsStreaming, IsInOutZA], [ImmCheck<3, ImmCheck0_1>]>;
@@ -518,8 +519,8 @@ let SMETargetGuard = "sme2" in {
}
let SMETargetGuard = "sme2,sme-f64f64" in {
- defm SVFMOP4A_MZZ_D : MOP4SingleSingle<"a", "za64", "d", "aarch64_sme_mop4a", "">;
- defm SVFMOP4S_MZZ_D : MOP4SingleSingle<"s", "za64", "d", "aarch64_sme_mop4s", "">;
+ defm SVFMOP4A_D : MOP4<"a", "za64", "d", "aarch64_sme_mop4a", "">;
+ defm SVFMOP4S_D : MOP4<"s", "za64", "d", "aarch64_sme_mop4s", "">;
def SVMLA_MULTI_VG1x2_F64 : Inst<"svmla_za64[_{d}]_vg1x2", "vm22", "d", MergeNone, "aarch64_sme_fmla_vg1x2", [IsStreaming, IsInOutZA], []>;
def SVMLA_MULTI_VG1x4_F64 : Inst<"svmla_za64[_{d}]_vg1x4", "vm44", "d", MergeNone, "aarch64_sme_fmla_vg1x4", [IsStreaming, IsInOutZA], []>;
@@ -538,8 +539,8 @@ let SMETargetGuard = "sme2,sme-f64f64" in {
}
let SMETargetGuard = "sme-f16f16" in {
- defm SVFMOP4A_MZZ_H : MOP4SingleSingle<"a", "za16", "h", "aarch64_sme_mop4a", "">;
- defm SVFMOP4S_MZZ_H : MOP4SingleSingle<"s", "za16", "h", "aarch64_sme_mop4s", "">;
+ defm SVFMOP4A_H : MOP4<"a", "za16", "h", "aarch64_sme_mop4a", "">;
+ defm SVFMOP4S_H : MOP4<"s", "za16", "h", "aarch64_sme_mop4s", "">;
def SVMLA_MULTI_VG1x2_F16 : Inst<"svmla_za16[_f16]_vg1x2", "vm22", "h", MergeNone, "aarch64_sme_fmla_vg1x2", [IsStreaming, IsInOutZA], []>;
def SVMLA_MULTI_VG1x4_F16 : Inst<"svmla_za16[_f16]_vg1x4", "vm44", "h", MergeNone, "aarch64_sme_fmla_vg1x4", [IsStreaming, IsInOutZA], []>;
@@ -558,8 +559,8 @@ let SMETargetGuard = "sme-f16f16" in {
}
let SMETargetGuard = "sme-b16b16" in {
- defm SVBMOP4A_MZZ_H : MOP4SingleSingle<"a", "za16", "bf", "aarch64_sme_mop4a", "">;
- defm SVBMOP4S_MZZ_H : MOP4SingleSingle<"s", "za16", "bf", "aarch64_sme_mop4s", "">;
+ defm SVBMOP4A_H : MOP4<"a", "za16", "bf", "aarch64_sme_mop4a", "">;
+ defm SVBMOP4S_H : MOP4<"s", "za16", "bf", "aarch64_sme_mop4s", "">;
def SVMLA_MULTI_VG1x2_BF16 : Inst<"svmla_za16[_bf16]_vg1x2", "vm22", "b", MergeNone, "aarch64_sme_fmla_vg1x2", [IsStreaming, IsInOutZA], []>;
def SVMLA_MULTI_VG1x4_BF16 : Inst<"svmla_za16[_bf16]_vg1x4", "vm44", "b", MergeNone, "aarch64_sme_fmla_vg1x4", [IsStreaming, IsInOutZA], []>;
diff --git a/llvm/test/CodeGen/AArch64/sme2-intrinsics-mop4a_1x1.ll b/llvm/test/CodeGen/AArch64/sme2-intrinsics-mop4a_1x1.ll
index 730bdb2d720d2..938c57ae89200 100644
--- a/llvm/test/CodeGen/AArch64/sme2-intrinsics-mop4a_1x1.ll
+++ b/llvm/test/CodeGen/AArch64/sme2-intrinsics-mop4a_1x1.ll
@@ -44,6 +44,46 @@ define void @mop4s_za32_u8(<vscale x 16 x i8> %zn, <vscale x 16 x i8> %zm) #0 {
ret void
}
+define void @mop4a_za32_s8_u8(<vscale x 16 x i8> %zn, <vscale x 16 x i8> %zm) #0 {
+; CHECK-LABEL: mop4a_za32_s8_u8:
+; CHECK: // %bb.0:
+; CHECK-NEXT: mov z24.d, z1.d
+; CHECK-NEXT: sumop4a za1.s, z0.b, z24.b
+; CHECK-NEXT: ret
+ call void @llvm.aarch64.sme.sumop4a.wide.1x1.nxv16i8(i32 1, <vscale x 16 x i8> %zn, <vscale x 16 x i8> %zm)
+ ret void
+}
+
+define void @mop4s_za32_s8_u8(<vscale x 16 x i8> %zn, <vscale x 16 x i8> %zm) #0 {
+; CHECK-LABEL: mop4s_za32_s8_u8:
+; CHECK: // %bb.0:
+; CHECK-NEXT: mov z24.d, z1.d
+; CHECK-NEXT: sumop4s za1.s, z0.b, z24.b
+; CHECK-NEXT: ret
+ call void @llvm.aarch64.sme.sumop4s.wide.1x1.nxv16i8(i32 1, <vscale x 16 x i8> %zn, <vscale x 16 x i8> %zm)
+ ret void
+}
+
+define void @mop4a_za32_u8_s8(<vscale x 16 x i8> %zn, <vscale x 16 x i8> %zm) #0 {
+; CHECK-LABEL: mop4a_za32_u8_s8:
+; CHECK: // %bb.0:
+; CHECK-NEXT: mov z24.d, z1.d
+; CHECK-NEXT: usmop4a za1.s, z0.b, z24.b
+; CHECK-NEXT: ret
+ call void @llvm.aarch64.sme.usmop4a.wide.1x1.nxv16i8(i32 1, <vscale x 16 x i8> %zn, <vscale x 16 x i8> %zm)
+ ret void
+}
+
+define void @mop4s_za32_u8_s8(<vscale x 16 x i8> %zn, <vscale x 16 x i8> %zm) #0 {
+; CHECK-LABEL: mop4s_za32_u8_s8:
+; CHECK: // %bb.0:
+; CHECK-NEXT: mov z24.d, z1.d
+; CHECK-NEXT: usmop4s za1.s, z0.b, z24.b
+; CHECK-NEXT: ret
+ call void @llvm.aarch64.sme.usmop4s.wide.1x1.nxv16i8(i32 1, <vscale x 16 x i8> %zn, <vscale x 16 x i8> %zm)
+ ret void
+}
+
define void @mop4a_za32_s16(<vscale x 8 x i16> %zn, <vscale x 8 x i16> %zm) #0 {
; CHECK-LABEL: mop4a_za32_s16:
; CHECK: // %bb.0:
From d29ca213e2cea26e06cc43e2d6695e3bff1e273b Mon Sep 17 00:00:00 2001
From: Virginia Cangelosi <virginia.cangelosi at arm.com>
Date: Wed, 26 Feb 2025 09:24:38 +0000
Subject: [PATCH 05/11] Add intrinsics to differentiate za64 and za32
---
clang/include/clang/Basic/arm_sme.td | 32 ++--
.../sme2-intrinsics/acle_sme2_mop4_1x1.c | 148 +++++++++---------
llvm/include/llvm/IR/IntrinsicsAArch64.td | 8 +
.../lib/Target/AArch64/AArch64SMEInstrInfo.td | 16 +-
.../AArch64/sme2-intrinsics-mop4a_1x1.ll | 40 +++++
5 files changed, 146 insertions(+), 98 deletions(-)
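
A short C sketch of the za64/za32 split this patch introduces (assuming the sme2 and sme-i16i64 features; names taken from the updated tests, so treat this as illustrative):

#include <arm_sme.h>

// 16-bit sources widening into a 64-bit ZA tile; these now route to the
// dedicated ...za64.wide.1x1 intrinsics instead of the za32 ones.
void widen64(svint16_t zn, svint16_t zm) __arm_streaming __arm_inout("za") {
  svmop4a_1x1_za64_s16_s16(3, zn, zm);
}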
diff --git a/clang/include/clang/Basic/arm_sme.td b/clang/include/clang/Basic/arm_sme.td
index 123a1c7a007ba..7cf1ae427b072 100644
--- a/clang/include/clang/Basic/arm_sme.td
+++ b/clang/include/clang/Basic/arm_sme.td
@@ -380,16 +380,16 @@ multiclass MOP4<string name, string n, string t, string i, string wide> {
def NAME # "_1x1" : Inst<"svmop4" # name # "_1x1_" # n # "[_{d}_{d}]", "vidd", t, MergeNone, i # wide # "_1x1", [IsInOutZA, IsStreaming], [ImmCheck<0, ImmCheck0_3>]>;
}
-multiclass SUMOP4<string s, string za, string t> {
+multiclass SUMOP4<string s, string za, string t, string i> {
def _1x1 : SInst<"svmop4" # s # "[_1x1_]" # za # "[_{2}_{3}]",
- "vidu", t, MergeNone, "aarch64_sme_sumop4" # s # "_wide_1x1",
+ "vidu", t, MergeNone, "aarch64_sme_sumop4" # s # i # "_wide_1x1",
[IsStreaming, IsInOutZA],
[ImmCheck<0, ImmCheck0_3>]>;
}
-multiclass USMOP4<string s, string za, string t> {
+multiclass USMOP4<string s, string za, string t, string i> {
def _1x1 : SInst<"svmop4" # s # "[_1x1_]" # za # "[_{2}_{3}]",
- "vidx", t, MergeNone, "aarch64_sme_usmop4" # s # "_wide_1x1",
+ "vidx", t, MergeNone, "aarch64_sme_usmop4" # s # i # "_wide_1x1",
[IsStreaming, IsInOutZA],
[ImmCheck<0, ImmCheck0_3>]>;
}
@@ -419,10 +419,10 @@ let SMETargetGuard = "sme2" in {
defm SVBMOP4A_S : MOP4<"a", "za32", "b", "aarch64_sme_mop4a", "_wide">;
defm SVBMOP4S_S : MOP4<"s", "za32", "b", "aarch64_sme_mop4s", "_wide">;
- defm SVSUMOP4A_S : SUMOP4<"a", "za32", "cs">;
- defm SVSUMOP4S_S : SUMOP4<"s", "za32", "cs">;
- defm SVUSMOP4A_S : USMOP4<"a", "za32", "UcUs">;
- defm SVUSMOP4S_S : USMOP4<"s", "za32", "UcUs">;
+ defm SVSUMOP4A_S : SUMOP4<"a", "za32", "cs", "">;
+ defm SVSUMOP4S_S : SUMOP4<"s", "za32", "cs", "">;
+ defm SVUSMOP4A_S : USMOP4<"a", "za32", "UcUs", "">;
+ defm SVUSMOP4S_S : USMOP4<"s", "za32", "UcUs", "">;
// VERTICAL DOT-PRODUCT
def SVVDOT_LANE_ZA32_VG1x2_S : Inst<"svvdot_lane_za32[_{d}]_vg1x2", "vm2di", "s", MergeNone, "aarch64_sme_svdot_lane_za32_vg1x2", [IsStreaming, IsInOutZA], [ImmCheck<3, ImmCheck0_3>]>;
@@ -474,14 +474,14 @@ let SMETargetGuard = "sme2" in {
}
let SMETargetGuard = "sme2,sme-i16i64" in {
- defm SVSMOP4A_HtoD : MOP4<"a", "za64", "s", "aarch64_sme_smop4a", "_wide">;
- defm SVSMOP4S_HtoD : MOP4<"s", "za64", "s", "aarch64_sme_smop4s", "_wide">;
- defm SVUMOP4A_HtoD : MOP4<"a", "za64", "Us", "aarch64_sme_umop4a", "_wide">;
- defm SVUMOP4S_HtoD : MOP4<"s", "za64", "Us", "aarch64_sme_umop4s", "_wide">;
- defm SVSUMOP4A_D : SUMOP4<"a", "za64", "s">;
- defm SVSUMOP4S_D : SUMOP4<"s", "za64", "s">;
- defm SVUSMOP4A_D : USMOP4<"a", "za64", "Us">;
- defm SVUSMOP4S_D : USMOP4<"s", "za64", "Us">;
+ defm SVSMOP4A_HtoD : MOP4<"a", "za64", "s", "aarch64_sme_smop4a_za64", "_wide">;
+ defm SVSMOP4S_HtoD : MOP4<"s", "za64", "s", "aarch64_sme_smop4s_za64", "_wide">;
+ defm SVUMOP4A_HtoD : MOP4<"a", "za64", "Us", "aarch64_sme_umop4a_za64", "_wide">;
+ defm SVUMOP4S_HtoD : MOP4<"s", "za64", "Us", "aarch64_sme_umop4s_za64", "_wide">;
+ defm SVSUMOP4A_D : SUMOP4<"a", "za64", "s", "_za64">;
+ defm SVSUMOP4S_D : SUMOP4<"s", "za64", "s", "_za64">;
+ defm SVUSMOP4A_D : USMOP4<"a", "za64", "Us", "_za64">;
+ defm SVUSMOP4S_D : USMOP4<"s", "za64", "Us", "_za64">;
def SVVDOT_LANE_ZA64_VG1x4_S : Inst<"svvdot_lane_za64[_{d}]_vg1x4", "vm4di", "s", MergeNone, "aarch64_sme_svdot_lane_za64_vg1x4", [IsStreaming, IsInOutZA], [ImmCheck<3, ImmCheck0_1>]>;
def SVVDOT_LANE_ZA64_VG1x4_U : Inst<"svvdot_lane_za64[_{d}]_vg1x4", "vm4di", "Us", MergeNone, "aarch64_sme_uvdot_lane_za64_vg1x4", [IsStreaming, IsInOutZA], [ImmCheck<3, ImmCheck0_1>]>;
diff --git a/clang/test/CodeGen/AArch64/sme2-intrinsics/acle_sme2_mop4_1x1.c b/clang/test/CodeGen/AArch64/sme2-intrinsics/acle_sme2_mop4_1x1.c
index eac3648f90368..34a9633374d3f 100644
--- a/clang/test/CodeGen/AArch64/sme2-intrinsics/acle_sme2_mop4_1x1.c
+++ b/clang/test/CodeGen/AArch64/sme2-intrinsics/acle_sme2_mop4_1x1.c
@@ -72,6 +72,62 @@ void test_svmop4s_1x1_za32_u8_u8(svuint8_t zn, svuint8_t zm) __arm_streaming __a
SME_ACLE_FUNC(svmop4s_1x1_za32,_u8_u8,)(3, zn, zm);
}
+// CHECK-LABEL: @test_svmop4a_1x1_za32_s8_u8(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: tail call void @llvm.aarch64.sme.sumop4a.wide.1x1.nxv16i8(i32 3, <vscale x 16 x i8> [[ZN:%.*]], <vscale x 16 x i8> [[ZM:%.*]])
+// CHECK-NEXT: ret void
+//
+// CPP-CHECK-LABEL: @_Z27test_svmop4a_1x1_za32_s8_u8u10__SVInt8_tu11__SVUint8_t(
+// CPP-CHECK-NEXT: entry:
+// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.sumop4a.wide.1x1.nxv16i8(i32 3, <vscale x 16 x i8> [[ZN:%.*]], <vscale x 16 x i8> [[ZM:%.*]])
+// CPP-CHECK-NEXT: ret void
+//
+void test_svmop4a_1x1_za32_s8_u8(svint8_t zn, svuint8_t zm) __arm_streaming __arm_inout("za") {
+ SME_ACLE_FUNC(svmop4a_1x1_za32,_s8_u8,)(3, zn, zm);
+}
+
+// CHECK-LABEL: @test_svmop4s_1x1_za32_s8_u8(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: tail call void @llvm.aarch64.sme.sumop4s.wide.1x1.nxv16i8(i32 3, <vscale x 16 x i8> [[ZN:%.*]], <vscale x 16 x i8> [[ZM:%.*]])
+// CHECK-NEXT: ret void
+//
+// CPP-CHECK-LABEL: @_Z27test_svmop4s_1x1_za32_s8_u8u10__SVInt8_tu11__SVUint8_t(
+// CPP-CHECK-NEXT: entry:
+// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.sumop4s.wide.1x1.nxv16i8(i32 3, <vscale x 16 x i8> [[ZN:%.*]], <vscale x 16 x i8> [[ZM:%.*]])
+// CPP-CHECK-NEXT: ret void
+//
+void test_svmop4s_1x1_za32_s8_u8(svint8_t zn, svuint8_t zm) __arm_streaming __arm_inout("za") {
+ SME_ACLE_FUNC(svmop4s_1x1_za32,_s8_u8,)(3, zn, zm);
+}
+
+// CHECK-LABEL: @test_svmop4a_1x1_za32_u8_s8(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: tail call void @llvm.aarch64.sme.usmop4a.wide.1x1.nxv16i8(i32 3, <vscale x 16 x i8> [[ZN:%.*]], <vscale x 16 x i8> [[ZM:%.*]])
+// CHECK-NEXT: ret void
+//
+// CPP-CHECK-LABEL: @_Z27test_svmop4a_1x1_za32_u8_s8u11__SVUint8_tu10__SVInt8_t(
+// CPP-CHECK-NEXT: entry:
+// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.usmop4a.wide.1x1.nxv16i8(i32 3, <vscale x 16 x i8> [[ZN:%.*]], <vscale x 16 x i8> [[ZM:%.*]])
+// CPP-CHECK-NEXT: ret void
+//
+void test_svmop4a_1x1_za32_u8_s8(svuint8_t zn, svint8_t zm) __arm_streaming __arm_inout("za") {
+ SME_ACLE_FUNC(svmop4a_1x1_za32,_u8_s8,)(3, zn, zm);
+}
+
+// CHECK-LABEL: @test_svmop4s_1x1_za32_u8_s8(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: tail call void @llvm.aarch64.sme.usmop4s.wide.1x1.nxv16i8(i32 3, <vscale x 16 x i8> [[ZN:%.*]], <vscale x 16 x i8> [[ZM:%.*]])
+// CHECK-NEXT: ret void
+//
+// CPP-CHECK-LABEL: @_Z27test_svmop4s_1x1_za32_u8_s8u11__SVUint8_tu10__SVInt8_t(
+// CPP-CHECK-NEXT: entry:
+// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.usmop4s.wide.1x1.nxv16i8(i32 3, <vscale x 16 x i8> [[ZN:%.*]], <vscale x 16 x i8> [[ZM:%.*]])
+// CPP-CHECK-NEXT: ret void
+//
+void test_svmop4s_1x1_za32_u8_s8(svuint8_t zn, svint8_t zm) __arm_streaming __arm_inout("za") {
+ SME_ACLE_FUNC(svmop4s_1x1_za32,_u8_s8,)(3, zn, zm);
+}
+
// CHECK-LABEL: @test_svmop4a_1x1_za32_s16_s16(
// CHECK-NEXT: entry:
// CHECK-NEXT: tail call void @llvm.aarch64.sme.smop4a.wide.1x1.nxv8i16(i32 3, <vscale x 8 x i16> [[ZN:%.*]], <vscale x 8 x i16> [[ZM:%.*]])
@@ -186,40 +242,40 @@ void test_svmop4s_1x1_za32_bf16_bf16(svbfloat16_t zn, svbfloat16_t zm) __arm_str
// CHECK-LABEL: @test_svmop4a_1x1_za64_s16_s16(
// CHECK-NEXT: entry:
-// CHECK-NEXT: tail call void @llvm.aarch64.sme.smop4a.wide.1x1.nxv8i16(i32 3, <vscale x 8 x i16> [[ZN:%.*]], <vscale x 8 x i16> [[ZM:%.*]])
+// CHECK-NEXT: tail call void @llvm.aarch64.sme.smop4a.za64.wide.1x1.nxv8i16(i32 3, <vscale x 8 x i16> [[ZN:%.*]], <vscale x 8 x i16> [[ZM:%.*]])
// CHECK-NEXT: ret void
//
// CPP-CHECK-LABEL: @_Z29test_svmop4a_1x1_za64_s16_s16u11__SVInt16_tS_(
// CPP-CHECK-NEXT: entry:
-// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.smop4a.wide.1x1.nxv8i16(i32 3, <vscale x 8 x i16> [[ZN:%.*]], <vscale x 8 x i16> [[ZM:%.*]])
+// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.smop4a.za64.wide.1x1.nxv8i16(i32 3, <vscale x 8 x i16> [[ZN:%.*]], <vscale x 8 x i16> [[ZM:%.*]])
// CPP-CHECK-NEXT: ret void
//
void test_svmop4a_1x1_za64_s16_s16(svint16_t zn, svint16_t zm) __arm_streaming __arm_inout("za") {
- SME_ACLE_FUNC(svmop4a_1x1_za32,_s16_s16,)(3, zn, zm);
+ SME_ACLE_FUNC(svmop4a_1x1_za64,_s16_s16,)(3, zn, zm);
}
// CHECK-LABEL: @test_svmop4s_1x1_za64_s16_s16(
// CHECK-NEXT: entry:
-// CHECK-NEXT: tail call void @llvm.aarch64.sme.smop4s.wide.1x1.nxv8i16(i32 3, <vscale x 8 x i16> [[ZN:%.*]], <vscale x 8 x i16> [[ZM:%.*]])
+// CHECK-NEXT: tail call void @llvm.aarch64.sme.smop4s.za64.wide.1x1.nxv8i16(i32 3, <vscale x 8 x i16> [[ZN:%.*]], <vscale x 8 x i16> [[ZM:%.*]])
// CHECK-NEXT: ret void
//
// CPP-CHECK-LABEL: @_Z29test_svmop4s_1x1_za64_s16_s16u11__SVInt16_tS_(
// CPP-CHECK-NEXT: entry:
-// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.smop4s.wide.1x1.nxv8i16(i32 3, <vscale x 8 x i16> [[ZN:%.*]], <vscale x 8 x i16> [[ZM:%.*]])
+// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.smop4s.za64.wide.1x1.nxv8i16(i32 3, <vscale x 8 x i16> [[ZN:%.*]], <vscale x 8 x i16> [[ZM:%.*]])
// CPP-CHECK-NEXT: ret void
//
void test_svmop4s_1x1_za64_s16_s16(svint16_t zn, svint16_t zm) __arm_streaming __arm_inout("za") {
- SME_ACLE_FUNC(svmop4s_1x1_za32,_s16_s16,)(3, zn, zm);
+ SME_ACLE_FUNC(svmop4s_1x1_za64,_s16_s16,)(3, zn, zm);
}
// CHECK-LABEL: @test_svmop4a_1x1_za64_u16_u16(
// CHECK-NEXT: entry:
-// CHECK-NEXT: tail call void @llvm.aarch64.sme.umop4a.wide.1x1.nxv8i16(i32 3, <vscale x 8 x i16> [[ZN:%.*]], <vscale x 8 x i16> [[ZM:%.*]])
+// CHECK-NEXT: tail call void @llvm.aarch64.sme.umop4a.za64.wide.1x1.nxv8i16(i32 3, <vscale x 8 x i16> [[ZN:%.*]], <vscale x 8 x i16> [[ZM:%.*]])
// CHECK-NEXT: ret void
//
// CPP-CHECK-LABEL: @_Z29test_svmop4a_1x1_za64_u16_u16u12__SVUint16_tS_(
// CPP-CHECK-NEXT: entry:
-// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.umop4a.wide.1x1.nxv8i16(i32 3, <vscale x 8 x i16> [[ZN:%.*]], <vscale x 8 x i16> [[ZM:%.*]])
+// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.umop4a.za64.wide.1x1.nxv8i16(i32 3, <vscale x 8 x i16> [[ZN:%.*]], <vscale x 8 x i16> [[ZM:%.*]])
// CPP-CHECK-NEXT: ret void
//
void test_svmop4a_1x1_za64_u16_u16(svuint16_t zn, svuint16_t zm) __arm_streaming __arm_inout("za") {
@@ -228,12 +284,12 @@ void test_svmop4a_1x1_za64_u16_u16(svuint16_t zn, svuint16_t zm) __arm_streaming
// CHECK-LABEL: @test_svmop4s_1x1_za64_u16_u16(
// CHECK-NEXT: entry:
-// CHECK-NEXT: tail call void @llvm.aarch64.sme.umop4s.wide.1x1.nxv8i16(i32 3, <vscale x 8 x i16> [[ZN:%.*]], <vscale x 8 x i16> [[ZM:%.*]])
+// CHECK-NEXT: tail call void @llvm.aarch64.sme.umop4s.za64.wide.1x1.nxv8i16(i32 3, <vscale x 8 x i16> [[ZN:%.*]], <vscale x 8 x i16> [[ZM:%.*]])
// CHECK-NEXT: ret void
//
// CPP-CHECK-LABEL: @_Z29test_svmop4s_1x1_za64_u16_u16u12__SVUint16_tS_(
// CPP-CHECK-NEXT: entry:
-// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.umop4s.wide.1x1.nxv8i16(i32 3, <vscale x 8 x i16> [[ZN:%.*]], <vscale x 8 x i16> [[ZM:%.*]])
+// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.umop4s.za64.wide.1x1.nxv8i16(i32 3, <vscale x 8 x i16> [[ZN:%.*]], <vscale x 8 x i16> [[ZM:%.*]])
// CPP-CHECK-NEXT: ret void
//
void test_svmop4s_1x1_za64_u16_u16(svuint16_t zn, svuint16_t zm) __arm_streaming __arm_inout("za") {
@@ -242,12 +298,12 @@ void test_svmop4s_1x1_za64_u16_u16(svuint16_t zn, svuint16_t zm) __arm_streaming
// CHECK-LABEL: @test_svmop4a_1x1_za64_s16_u16(
// CHECK-NEXT: entry:
-// CHECK-NEXT: tail call void @llvm.aarch64.sme.sumop4a.wide.1x1.nxv8i16(i32 3, <vscale x 8 x i16> [[ZN:%.*]], <vscale x 8 x i16> [[ZM:%.*]])
+// CHECK-NEXT: tail call void @llvm.aarch64.sme.sumop4a.za64.wide.1x1.nxv8i16(i32 3, <vscale x 8 x i16> [[ZN:%.*]], <vscale x 8 x i16> [[ZM:%.*]])
// CHECK-NEXT: ret void
//
// CPP-CHECK-LABEL: @_Z29test_svmop4a_1x1_za64_s16_u16u11__SVInt16_tu12__SVUint16_t(
// CPP-CHECK-NEXT: entry:
-// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.sumop4a.wide.1x1.nxv8i16(i32 3, <vscale x 8 x i16> [[ZN:%.*]], <vscale x 8 x i16> [[ZM:%.*]])
+// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.sumop4a.za64.wide.1x1.nxv8i16(i32 3, <vscale x 8 x i16> [[ZN:%.*]], <vscale x 8 x i16> [[ZM:%.*]])
// CPP-CHECK-NEXT: ret void
//
void test_svmop4a_1x1_za64_s16_u16(svint16_t zn, svuint16_t zm) __arm_streaming __arm_inout("za") {
@@ -256,12 +312,12 @@ void test_svmop4a_1x1_za64_s16_u16(svint16_t zn, svuint16_t zm) __arm_streaming
// CHECK-LABEL: @test_svmop4s_1x1_za64_s16_u16(
// CHECK-NEXT: entry:
-// CHECK-NEXT: tail call void @llvm.aarch64.sme.sumop4s.wide.1x1.nxv8i16(i32 3, <vscale x 8 x i16> [[ZN:%.*]], <vscale x 8 x i16> [[ZM:%.*]])
+// CHECK-NEXT: tail call void @llvm.aarch64.sme.sumop4s.za64.wide.1x1.nxv8i16(i32 3, <vscale x 8 x i16> [[ZN:%.*]], <vscale x 8 x i16> [[ZM:%.*]])
// CHECK-NEXT: ret void
//
// CPP-CHECK-LABEL: @_Z29test_svmop4s_1x1_za64_s16_u16u11__SVInt16_tu12__SVUint16_t(
// CPP-CHECK-NEXT: entry:
-// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.sumop4s.wide.1x1.nxv8i16(i32 3, <vscale x 8 x i16> [[ZN:%.*]], <vscale x 8 x i16> [[ZM:%.*]])
+// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.sumop4s.za64.wide.1x1.nxv8i16(i32 3, <vscale x 8 x i16> [[ZN:%.*]], <vscale x 8 x i16> [[ZM:%.*]])
// CPP-CHECK-NEXT: ret void
//
void test_svmop4s_1x1_za64_s16_u16(svint16_t zn, svuint16_t zm) __arm_streaming __arm_inout("za") {
@@ -270,12 +326,12 @@ void test_svmop4s_1x1_za64_s16_u16(svint16_t zn, svuint16_t zm) __arm_streaming
// CHECK-LABEL: @test_svmop4a_1x1_za64_u16_s16(
// CHECK-NEXT: entry:
-// CHECK-NEXT: tail call void @llvm.aarch64.sme.usmop4a.wide.1x1.nxv8i16(i32 3, <vscale x 8 x i16> [[ZN:%.*]], <vscale x 8 x i16> [[ZM:%.*]])
+// CHECK-NEXT: tail call void @llvm.aarch64.sme.usmop4a.za64.wide.1x1.nxv8i16(i32 3, <vscale x 8 x i16> [[ZN:%.*]], <vscale x 8 x i16> [[ZM:%.*]])
// CHECK-NEXT: ret void
//
// CPP-CHECK-LABEL: @_Z29test_svmop4a_1x1_za64_u16_s16u12__SVUint16_tu11__SVInt16_t(
// CPP-CHECK-NEXT: entry:
-// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.usmop4a.wide.1x1.nxv8i16(i32 3, <vscale x 8 x i16> [[ZN:%.*]], <vscale x 8 x i16> [[ZM:%.*]])
+// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.usmop4a.za64.wide.1x1.nxv8i16(i32 3, <vscale x 8 x i16> [[ZN:%.*]], <vscale x 8 x i16> [[ZM:%.*]])
// CPP-CHECK-NEXT: ret void
//
void test_svmop4a_1x1_za64_u16_s16(svuint16_t zn, svint16_t zm) __arm_streaming __arm_inout("za") {
@@ -284,74 +340,18 @@ void test_svmop4a_1x1_za64_u16_s16(svuint16_t zn, svint16_t zm) __arm_streaming
// CHECK-LABEL: @test_svmop4s_1x1_za64_u16_s16(
// CHECK-NEXT: entry:
-// CHECK-NEXT: tail call void @llvm.aarch64.sme.usmop4s.wide.1x1.nxv8i16(i32 3, <vscale x 8 x i16> [[ZN:%.*]], <vscale x 8 x i16> [[ZM:%.*]])
+// CHECK-NEXT: tail call void @llvm.aarch64.sme.usmop4s.za64.wide.1x1.nxv8i16(i32 3, <vscale x 8 x i16> [[ZN:%.*]], <vscale x 8 x i16> [[ZM:%.*]])
// CHECK-NEXT: ret void
//
// CPP-CHECK-LABEL: @_Z29test_svmop4s_1x1_za64_u16_s16u12__SVUint16_tu11__SVInt16_t(
// CPP-CHECK-NEXT: entry:
-// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.usmop4s.wide.1x1.nxv8i16(i32 3, <vscale x 8 x i16> [[ZN:%.*]], <vscale x 8 x i16> [[ZM:%.*]])
+// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.usmop4s.za64.wide.1x1.nxv8i16(i32 3, <vscale x 8 x i16> [[ZN:%.*]], <vscale x 8 x i16> [[ZM:%.*]])
// CPP-CHECK-NEXT: ret void
//
void test_svmop4s_1x1_za64_u16_s16(svuint16_t zn, svint16_t zm) __arm_streaming __arm_inout("za") {
SME_ACLE_FUNC(svmop4s_1x1_za64,_u16_s16,)(3, zn, zm);
}
-// CHECK-LABEL: @test_svmop4a_1x1_za32_s8_u8(
-// CHECK-NEXT: entry:
-// CHECK-NEXT: tail call void @llvm.aarch64.sme.sumop4a.wide.1x1.nxv16i8(i32 3, <vscale x 16 x i8> [[ZN:%.*]], <vscale x 16 x i8> [[ZM:%.*]])
-// CHECK-NEXT: ret void
-//
-// CPP-CHECK-LABEL: @_Z27test_svmop4a_1x1_za32_s8_u8u10__SVInt8_tu11__SVUint8_t(
-// CPP-CHECK-NEXT: entry:
-// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.sumop4a.wide.1x1.nxv16i8(i32 3, <vscale x 16 x i8> [[ZN:%.*]], <vscale x 16 x i8> [[ZM:%.*]])
-// CPP-CHECK-NEXT: ret void
-//
-void test_svmop4a_1x1_za32_s8_u8(svint8_t zn, svuint8_t zm) __arm_streaming __arm_inout("za") {
- SME_ACLE_FUNC(svmop4a_1x1_za32,_s8_u8,)(3, zn, zm);
-}
-
-// CHECK-LABEL: @test_svmop4s_1x1_za32_s8_u8(
-// CHECK-NEXT: entry:
-// CHECK-NEXT: tail call void @llvm.aarch64.sme.sumop4s.wide.1x1.nxv16i8(i32 3, <vscale x 16 x i8> [[ZN:%.*]], <vscale x 16 x i8> [[ZM:%.*]])
-// CHECK-NEXT: ret void
-//
-// CPP-CHECK-LABEL: @_Z27test_svmop4s_1x1_za32_s8_u8u10__SVInt8_tu11__SVUint8_t(
-// CPP-CHECK-NEXT: entry:
-// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.sumop4s.wide.1x1.nxv16i8(i32 3, <vscale x 16 x i8> [[ZN:%.*]], <vscale x 16 x i8> [[ZM:%.*]])
-// CPP-CHECK-NEXT: ret void
-//
-void test_svmop4s_1x1_za32_s8_u8(svint8_t zn, svuint8_t zm) __arm_streaming __arm_inout("za") {
- SME_ACLE_FUNC(svmop4s_1x1_za32,_s8_u8,)(3, zn, zm);
-}
-
-// CHECK-LABEL: @test_svmop4a_1x1_za32_u8_s8(
-// CHECK-NEXT: entry:
-// CHECK-NEXT: tail call void @llvm.aarch64.sme.usmop4a.wide.1x1.nxv16i8(i32 3, <vscale x 16 x i8> [[ZN:%.*]], <vscale x 16 x i8> [[ZM:%.*]])
-// CHECK-NEXT: ret void
-//
-// CPP-CHECK-LABEL: @_Z27test_svmop4a_1x1_za32_u8_s8u11__SVUint8_tu10__SVInt8_t(
-// CPP-CHECK-NEXT: entry:
-// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.usmop4a.wide.1x1.nxv16i8(i32 3, <vscale x 16 x i8> [[ZN:%.*]], <vscale x 16 x i8> [[ZM:%.*]])
-// CPP-CHECK-NEXT: ret void
-//
-void test_svmop4a_1x1_za32_u8_s8(svuint8_t zn, svint8_t zm) __arm_streaming __arm_inout("za") {
- SME_ACLE_FUNC(svmop4a_1x1_za32,_u8_s8,)(3, zn, zm);
-}
-
-// CHECK-LABEL: @test_svmop4s_1x1_za32_u8_s8(
-// CHECK-NEXT: entry:
-// CHECK-NEXT: tail call void @llvm.aarch64.sme.usmop4s.wide.1x1.nxv16i8(i32 3, <vscale x 16 x i8> [[ZN:%.*]], <vscale x 16 x i8> [[ZM:%.*]])
-// CHECK-NEXT: ret void
-//
-// CPP-CHECK-LABEL: @_Z27test_svmop4s_1x1_za32_u8_s8u11__SVUint8_tu10__SVInt8_t(
-// CPP-CHECK-NEXT: entry:
-// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.usmop4s.wide.1x1.nxv16i8(i32 3, <vscale x 16 x i8> [[ZN:%.*]], <vscale x 16 x i8> [[ZM:%.*]])
-// CPP-CHECK-NEXT: ret void
-//
-void test_svmop4s_1x1_za32_u8_s8(svuint8_t zn, svint8_t zm) __arm_streaming __arm_inout("za") {
- SME_ACLE_FUNC(svmop4s_1x1_za32,_u8_s8,)(3, zn, zm);
-}
-
// CHECK-LABEL: @test_svmop4a_1x1_za16_f16_f16(
// CHECK-NEXT: entry:
// CHECK-NEXT: tail call void @llvm.aarch64.sme.mop4a.1x1.nxv8f16(i32 3, <vscale x 8 x half> [[ZN:%.*]], <vscale x 8 x half> [[ZM:%.*]])
diff --git a/llvm/include/llvm/IR/IntrinsicsAArch64.td b/llvm/include/llvm/IR/IntrinsicsAArch64.td
index 8556ae83c79f9..eeea8d77d1e16 100644
--- a/llvm/include/llvm/IR/IntrinsicsAArch64.td
+++ b/llvm/include/llvm/IR/IntrinsicsAArch64.td
@@ -3090,6 +3090,14 @@ let TargetPrefix = "aarch64" in {
def int_aarch64_sme_usmop4s_wide_1x1 : SME_OuterProduct_QuaterTile_Single;
def int_aarch64_sme_usmop4a_1x1 : SME_OuterProduct_QuaterTile_Single;
def int_aarch64_sme_usmop4s_1x1 : SME_OuterProduct_QuaterTile_Single;
+ def int_aarch64_sme_smop4a_za64_wide_1x1 : SME_OuterProduct_QuaterTile_Single;
+ def int_aarch64_sme_smop4s_za64_wide_1x1 : SME_OuterProduct_QuaterTile_Single;
+ def int_aarch64_sme_umop4a_za64_wide_1x1 : SME_OuterProduct_QuaterTile_Single;
+ def int_aarch64_sme_umop4s_za64_wide_1x1 : SME_OuterProduct_QuaterTile_Single;
+ def int_aarch64_sme_sumop4a_za64_wide_1x1 : SME_OuterProduct_QuaterTile_Single;
+ def int_aarch64_sme_sumop4s_za64_wide_1x1 : SME_OuterProduct_QuaterTile_Single;
+ def int_aarch64_sme_usmop4a_za64_wide_1x1 : SME_OuterProduct_QuaterTile_Single;
+ def int_aarch64_sme_usmop4s_za64_wide_1x1 : SME_OuterProduct_QuaterTile_Single;
class SME_AddVectorToTile_Intrinsic
: DefaultAttrsIntrinsic<[],
diff --git a/llvm/lib/Target/AArch64/AArch64SMEInstrInfo.td b/llvm/lib/Target/AArch64/AArch64SMEInstrInfo.td
index c02e72e8b4c26..f992f73171e0e 100644
--- a/llvm/lib/Target/AArch64/AArch64SMEInstrInfo.td
+++ b/llvm/lib/Target/AArch64/AArch64SMEInstrInfo.td
@@ -164,14 +164,14 @@ let Predicates = [HasSME_MOP4] in {
}
let Predicates = [HasSME_MOP4, HasSMEI16I64] in {
- defm SMOP4A : sme_quarter_outer_product_i64<0b0, 0b0, 0b0, "smop4a", "int_aarch64_sme_smop4a_wide">;
- defm SMOP4S : sme_quarter_outer_product_i64<0b0, 0b0, 0b1, "smop4s", "int_aarch64_sme_smop4s_wide">;
- defm SUMOP4A : sme_quarter_outer_product_i64<0b0, 0b1, 0b0, "sumop4a", "int_aarch64_sme_sumop4a_wide">;
- defm SUMOP4S : sme_quarter_outer_product_i64<0b0, 0b1, 0b1, "sumop4s", "int_aarch64_sme_sumop4s_wide">;
- defm UMOP4A : sme_quarter_outer_product_i64<0b1, 0b1, 0b0, "umop4a", "int_aarch64_sme_umop4a_wide">;
- defm UMOP4S : sme_quarter_outer_product_i64<0b1, 0b1, 0b1, "umop4s", "int_aarch64_sme_umop4s_wide">;
- defm USMOP4A : sme_quarter_outer_product_i64<0b1, 0b0, 0b0, "usmop4a", "int_aarch64_sme_usmop4a_wide">;
- defm USMOP4S : sme_quarter_outer_product_i64<0b1, 0b0, 0b1, "usmop4s", "int_aarch64_sme_usmop4s_wide">;
+ defm SMOP4A : sme_quarter_outer_product_i64<0b0, 0b0, 0b0, "smop4a", "int_aarch64_sme_smop4a_za64_wide">;
+ defm SMOP4S : sme_quarter_outer_product_i64<0b0, 0b0, 0b1, "smop4s", "int_aarch64_sme_smop4s_za64_wide">;
+ defm SUMOP4A : sme_quarter_outer_product_i64<0b0, 0b1, 0b0, "sumop4a", "int_aarch64_sme_sumop4a_za64_wide">;
+ defm SUMOP4S : sme_quarter_outer_product_i64<0b0, 0b1, 0b1, "sumop4s", "int_aarch64_sme_sumop4s_za64_wide">;
+ defm UMOP4A : sme_quarter_outer_product_i64<0b1, 0b1, 0b0, "umop4a", "int_aarch64_sme_umop4a_za64_wide">;
+ defm UMOP4S : sme_quarter_outer_product_i64<0b1, 0b1, 0b1, "umop4s", "int_aarch64_sme_umop4s_za64_wide">;
+ defm USMOP4A : sme_quarter_outer_product_i64<0b1, 0b0, 0b0, "usmop4a", "int_aarch64_sme_usmop4a_za64_wide">;
+ defm USMOP4S : sme_quarter_outer_product_i64<0b1, 0b0, 0b1, "usmop4s", "int_aarch64_sme_usmop4s_za64_wide">;
}
let Predicates = [HasSME_TMOP] in {
diff --git a/llvm/test/CodeGen/AArch64/sme2-intrinsics-mop4a_1x1.ll b/llvm/test/CodeGen/AArch64/sme2-intrinsics-mop4a_1x1.ll
index 938c57ae89200..7bcf407d23297 100644
--- a/llvm/test/CodeGen/AArch64/sme2-intrinsics-mop4a_1x1.ll
+++ b/llvm/test/CodeGen/AArch64/sme2-intrinsics-mop4a_1x1.ll
@@ -204,6 +204,46 @@ define void @mop4s_za64_u16(<vscale x 8 x i16> %zn, <vscale x 8 x i16> %zm) #0 {
ret void
}
+define void @mop4a_za64_s16_u16(<vscale x 8 x i16> %zn, <vscale x 8 x i16> %zm) #0 {
+; CHECK-LABEL: mop4a_za64_s16_u16:
+; CHECK: // %bb.0:
+; CHECK-NEXT: mov z24.d, z1.d
+; CHECK-NEXT: sumop4a za1.d, z0.h, z24.h
+; CHECK-NEXT: ret
+ call void @llvm.aarch64.sme.sumop4a.za64.wide.1x1.nxv8i16(i32 1, <vscale x 8 x i16> %zn, <vscale x 8 x i16> %zm)
+ ret void
+}
+
+define void @mop4s_za64_s16_u16(<vscale x 8 x i16> %zn, <vscale x 8 x i16> %zm) #0 {
+; CHECK-LABEL: mop4s_za64_s16_u16:
+; CHECK: // %bb.0:
+; CHECK-NEXT: mov z24.d, z1.d
+; CHECK-NEXT: sumop4s za1.d, z0.h, z24.h
+; CHECK-NEXT: ret
+ call void @llvm.aarch64.sme.sumop4s.za64.wide.1x1.nxv8i16(i32 1, <vscale x 8 x i16> %zn, <vscale x 8 x i16> %zm)
+ ret void
+}
+
+define void @mop4a_za64_u16_s16(<vscale x 8 x i16> %zn, <vscale x 8 x i16> %zm) #0 {
+; CHECK-LABEL: mop4a_za64_u16_s16:
+; CHECK: // %bb.0:
+; CHECK-NEXT: mov z24.d, z1.d
+; CHECK-NEXT: usmop4a za1.d, z0.h, z24.h
+; CHECK-NEXT: ret
+ call void @llvm.aarch64.sme.usmop4a.za64.wide.1x1.nxv8i16(i32 1, <vscale x 8 x i16> %zn, <vscale x 8 x i16> %zm)
+ ret void
+}
+
+define void @mop4s_za64_u16_s16(<vscale x 8 x i16> %zn, <vscale x 8 x i16> %zm) #0 {
+; CHECK-LABEL: mop4s_za64_u16_s16:
+; CHECK: // %bb.0:
+; CHECK-NEXT: mov z24.d, z1.d
+; CHECK-NEXT: usmop4s za1.d, z0.h, z24.h
+; CHECK-NEXT: ret
+ call void @llvm.aarch64.sme.usmop4s.za64.wide.1x1.nxv8i16(i32 1, <vscale x 8 x i16> %zn, <vscale x 8 x i16> %zm)
+ ret void
+}
+
; Non-widening
define void @mop4a_za16_f16(<vscale x 8 x half> %zn, <vscale x 8 x half> %zm) #0 {
; CHECK-LABEL: mop4a_za16_f16:
From d69c8ed10a8092fb28871a28a438e02cf9a74fea Mon Sep 17 00:00:00 2001
From: Virginia Cangelosi <virginia.cangelosi at arm.com>
Date: Wed, 26 Feb 2025 14:12:46 +0000
Subject: [PATCH 06/11] Fix immediates and add more tests
---
clang/include/clang/Basic/arm_sme.td | 72 +++----
.../sme2-intrinsics/acle_sme2_mop4_1x1.c | 192 +++++++++---------
llvm/lib/Target/AArch64/SMEInstrFormats.td | 8 +-
.../AArch64/sme2-intrinsics-mop4a_1x1.ll | 154 +++++++++++---
4 files changed, 259 insertions(+), 167 deletions(-)
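
The tile-index immediates now track the accumulator width; a hedged sketch with the valid ranges inferred from the ImmCheck updates below (assumes a target with both sme-i16i64 and sme-f16f16 for illustration):

#include <arm_sme.h>

void tiles(svint16_t n16, svint16_t m16, svfloat16_t fn, svfloat16_t fm)
    __arm_streaming __arm_inout("za") {
  svmop4a_1x1_za64_s16_s16(7, n16, m16);  // za64 allows tiles 0-7 (ImmCheck0_7)
  svmop4a_1x1_za16_f16_f16(1, fn, fm);    // za16 allows tiles 0-1 (ImmCheck0_1)
}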
diff --git a/clang/include/clang/Basic/arm_sme.td b/clang/include/clang/Basic/arm_sme.td
index 7cf1ae427b072..ff42f110f72cb 100644
--- a/clang/include/clang/Basic/arm_sme.td
+++ b/clang/include/clang/Basic/arm_sme.td
@@ -376,22 +376,22 @@ let SMETargetGuard = "sme2" in {
// Outer product and accumulate/subtract
//
-multiclass MOP4<string name, string n, string t, string i, string wide> {
- def NAME # "_1x1" : Inst<"svmop4" # name # "_1x1_" # n # "[_{d}_{d}]", "vidd", t, MergeNone, i # wide # "_1x1", [IsInOutZA, IsStreaming], [ImmCheck<0, ImmCheck0_3>]>;
+multiclass MOP4<string name, string n, string t, string i, string wide, list<ImmCheck> checks> {
+ def NAME # "_1x1" : Inst<"svmop4" # name # "_1x1_" # n # "[_{d}_{d}]", "vidd", t, MergeNone, i # wide # "_1x1", [IsInOutZA, IsStreaming], checks>;
}
-multiclass SUMOP4<string s, string za, string t, string i> {
- def _1x1 : SInst<"svmop4" # s # "[_1x1_]" # za # "[_{2}_{3}]",
+multiclass SUMOP4<string s, string za, string t, string i, list<ImmCheck> checks> {
+ def _1x1 : SInst<"svmop4" # s # "[_1x1_]" # za # "[_{d}_{3}]",
"vidu", t, MergeNone, "aarch64_sme_sumop4" # s # i # "_wide_1x1",
[IsStreaming, IsInOutZA],
- [ImmCheck<0, ImmCheck0_3>]>;
+ checks>;
}
-multiclass USMOP4<string s, string za, string t, string i> {
- def _1x1 : SInst<"svmop4" # s # "[_1x1_]" # za # "[_{2}_{3}]",
+multiclass USMOP4<string s, string za, string t, string i, list<ImmCheck> checks> {
+ def _1x1 : SInst<"svmop4" # s # "[_1x1_]" # za # "[_{d}_{3}]",
"vidx", t, MergeNone, "aarch64_sme_usmop4" # s # i # "_wide_1x1",
[IsStreaming, IsInOutZA],
- [ImmCheck<0, ImmCheck0_3>]>;
+ checks>;
}
let SMETargetGuard = "sme2" in {
@@ -405,24 +405,24 @@ let SMETargetGuard = "sme2" in {
def SVBMOPS : Inst<"svbmops_za32[_{d}]_m", "viPPdd", "iUi", MergeNone, "aarch64_sme_bmops_za32", [IsInOutZA, IsStreaming], [ImmCheck<0, ImmCheck0_3>]>;
- defm SVSMOP4A_H : MOP4<"a", "za32", "cs", "aarch64_sme_smop4a", "_wide">;
- defm SVSMOP4S_H : MOP4<"s", "za32", "cs", "aarch64_sme_smop4s", "_wide">;
+ defm SVSMOP4A_H : MOP4<"a", "za32", "cs", "aarch64_sme_smop4a", "_wide", [ImmCheck<0, ImmCheck0_3>]>;
+ defm SVSMOP4S_H : MOP4<"s", "za32", "cs", "aarch64_sme_smop4s", "_wide", [ImmCheck<0, ImmCheck0_3>]>;
- defm SVUMOP4A_H : MOP4<"a", "za32", "UcUs", "aarch64_sme_umop4a", "_wide">;
- defm SVUMOP4S_H : MOP4<"s", "za32", "UcUs", "aarch64_sme_umop4s", "_wide">;
+ defm SVUMOP4A_H : MOP4<"a", "za32", "UcUs", "aarch64_sme_umop4a", "_wide", [ImmCheck<0, ImmCheck0_3>]>;
+ defm SVUMOP4S_H : MOP4<"s", "za32", "UcUs", "aarch64_sme_umop4s", "_wide", [ImmCheck<0, ImmCheck0_3>]>;
- defm SVFMOP4A_HtoS : MOP4<"a", "za32", "h", "aarch64_sme_mop4a", "_wide">;
- defm SVFMOP4S_HtoS : MOP4<"s", "za32", "h", "aarch64_sme_mop4s", "_wide">;
- defm SVFMOP4A_S : MOP4<"a", "za32", "f", "aarch64_sme_mop4a", "">;
- defm SVFMOP4S_S : MOP4<"s", "za32", "f", "aarch64_sme_mop4s", "">;
+ defm SVFMOP4A_HtoS : MOP4<"a", "za32", "h", "aarch64_sme_mop4a", "_wide", [ImmCheck<0, ImmCheck0_3>]>;
+ defm SVFMOP4S_HtoS : MOP4<"s", "za32", "h", "aarch64_sme_mop4s", "_wide", [ImmCheck<0, ImmCheck0_3>]>;
+ defm SVFMOP4A_S : MOP4<"a", "za32", "f", "aarch64_sme_mop4a", "", [ImmCheck<0, ImmCheck0_3>]>;
+ defm SVFMOP4S_S : MOP4<"s", "za32", "f", "aarch64_sme_mop4s", "", [ImmCheck<0, ImmCheck0_3>]>;
- defm SVBMOP4A_S : MOP4<"a", "za32", "b", "aarch64_sme_mop4a", "_wide">;
- defm SVBMOP4S_S : MOP4<"s", "za32", "b", "aarch64_sme_mop4s", "_wide">;
+ defm SVBMOP4A_S : MOP4<"a", "za32", "b", "aarch64_sme_mop4a", "_wide", [ImmCheck<0, ImmCheck0_3>]>;
+ defm SVBMOP4S_S : MOP4<"s", "za32", "b", "aarch64_sme_mop4s", "_wide", [ImmCheck<0, ImmCheck0_3>]>;
- defm SVSUMOP4A_S : SUMOP4<"a", "za32", "cs", "">;
- defm SVSUMOP4S_S : SUMOP4<"s", "za32", "cs", "">;
- defm SVUSMOP4A_S : USMOP4<"a", "za32", "UcUs", "">;
- defm SVUSMOP4S_S : USMOP4<"s", "za32", "UcUs", "">;
+ defm SVSUMOP4A_S : SUMOP4<"a", "za32", "cs", "", [ImmCheck<0, ImmCheck0_3>]>;
+ defm SVSUMOP4S_S : SUMOP4<"s", "za32", "cs", "", [ImmCheck<0, ImmCheck0_3>]>;
+ defm SVUSMOP4A_S : USMOP4<"a", "za32", "UcUs", "", [ImmCheck<0, ImmCheck0_3>]>;
+ defm SVUSMOP4S_S : USMOP4<"s", "za32", "UcUs", "", [ImmCheck<0, ImmCheck0_3>]>;
// VERTICAL DOT-PRODUCT
def SVVDOT_LANE_ZA32_VG1x2_S : Inst<"svvdot_lane_za32[_{d}]_vg1x2", "vm2di", "s", MergeNone, "aarch64_sme_svdot_lane_za32_vg1x2", [IsStreaming, IsInOutZA], [ImmCheck<3, ImmCheck0_3>]>;
@@ -474,14 +474,14 @@ let SMETargetGuard = "sme2" in {
}
let SMETargetGuard = "sme2,sme-i16i64" in {
- defm SVSMOP4A_HtoD : MOP4<"a", "za64", "s", "aarch64_sme_smop4a_za64", "_wide">;
- defm SVSMOP4S_HtoD : MOP4<"s", "za64", "s", "aarch64_sme_smop4s_za64", "_wide">;
- defm SVUMOP4A_HtoD : MOP4<"a", "za64", "Us", "aarch64_sme_umop4a_za64", "_wide">;
- defm SVUMOP4S_HtoD : MOP4<"s", "za64", "Us", "aarch64_sme_umop4s_za64", "_wide">;
- defm SVSUMOP4A_D : SUMOP4<"a", "za64", "s", "_za64">;
- defm SVSUMOP4S_D : SUMOP4<"s", "za64", "s", "_za64">;
- defm SVUSMOP4A_D : USMOP4<"a", "za64", "Us", "_za64">;
- defm SVUSMOP4S_D : USMOP4<"s", "za64", "Us", "_za64">;
+ defm SVSMOP4A_HtoD : MOP4<"a", "za64", "s", "aarch64_sme_smop4a_za64", "_wide", [ImmCheck<0, ImmCheck0_7>]>;
+ defm SVSMOP4S_HtoD : MOP4<"s", "za64", "s", "aarch64_sme_smop4s_za64", "_wide", [ImmCheck<0, ImmCheck0_7>]>;
+ defm SVUMOP4A_HtoD : MOP4<"a", "za64", "Us", "aarch64_sme_umop4a_za64", "_wide", [ImmCheck<0, ImmCheck0_7>]>;
+ defm SVUMOP4S_HtoD : MOP4<"s", "za64", "Us", "aarch64_sme_umop4s_za64", "_wide", [ImmCheck<0, ImmCheck0_7>]>;
+ defm SVSUMOP4A_D : SUMOP4<"a", "za64", "s", "_za64", [ImmCheck<0, ImmCheck0_7>]>;
+ defm SVSUMOP4S_D : SUMOP4<"s", "za64", "s", "_za64", [ImmCheck<0, ImmCheck0_7>]>;
+ defm SVUSMOP4A_D : USMOP4<"a", "za64", "Us", "_za64", [ImmCheck<0, ImmCheck0_7>]>;
+ defm SVUSMOP4S_D : USMOP4<"s", "za64", "Us", "_za64", [ImmCheck<0, ImmCheck0_7>]>;
def SVVDOT_LANE_ZA64_VG1x4_S : Inst<"svvdot_lane_za64[_{d}]_vg1x4", "vm4di", "s", MergeNone, "aarch64_sme_svdot_lane_za64_vg1x4", [IsStreaming, IsInOutZA], [ImmCheck<3, ImmCheck0_1>]>;
def SVVDOT_LANE_ZA64_VG1x4_U : Inst<"svvdot_lane_za64[_{d}]_vg1x4", "vm4di", "Us", MergeNone, "aarch64_sme_uvdot_lane_za64_vg1x4", [IsStreaming, IsInOutZA], [ImmCheck<3, ImmCheck0_1>]>;
@@ -519,8 +519,8 @@ let SMETargetGuard = "sme2" in {
}
let SMETargetGuard = "sme2,sme-f64f64" in {
- defm SVFMOP4A_D : MOP4<"a", "za64", "d", "aarch64_sme_mop4a", "">;
- defm SVFMOP4S_D : MOP4<"s", "za64", "d", "aarch64_sme_mop4s", "">;
+ defm SVFMOP4A_D : MOP4<"a", "za64", "d", "aarch64_sme_mop4a", "", [ImmCheck<0, ImmCheck0_7>]>;
+ defm SVFMOP4S_D : MOP4<"s", "za64", "d", "aarch64_sme_mop4s", "", [ImmCheck<0, ImmCheck0_7>]>;
def SVMLA_MULTI_VG1x2_F64 : Inst<"svmla_za64[_{d}]_vg1x2", "vm22", "d", MergeNone, "aarch64_sme_fmla_vg1x2", [IsStreaming, IsInOutZA], []>;
def SVMLA_MULTI_VG1x4_F64 : Inst<"svmla_za64[_{d}]_vg1x4", "vm44", "d", MergeNone, "aarch64_sme_fmla_vg1x4", [IsStreaming, IsInOutZA], []>;
@@ -539,8 +539,8 @@ let SMETargetGuard = "sme2,sme-f64f64" in {
}
let SMETargetGuard = "sme-f16f16" in {
- defm SVFMOP4A_H : MOP4<"a", "za16", "h", "aarch64_sme_mop4a", "">;
- defm SVFMOP4S_H : MOP4<"s", "za16", "h", "aarch64_sme_mop4s", "">;
+ defm SVFMOP4A_H : MOP4<"a", "za16", "h", "aarch64_sme_mop4a", "", [ImmCheck<0, ImmCheck0_1>]>;
+ defm SVFMOP4S_H : MOP4<"s", "za16", "h", "aarch64_sme_mop4s", "", [ImmCheck<0, ImmCheck0_1>]>;
def SVMLA_MULTI_VG1x2_F16 : Inst<"svmla_za16[_f16]_vg1x2", "vm22", "h", MergeNone, "aarch64_sme_fmla_vg1x2", [IsStreaming, IsInOutZA], []>;
def SVMLA_MULTI_VG1x4_F16 : Inst<"svmla_za16[_f16]_vg1x4", "vm44", "h", MergeNone, "aarch64_sme_fmla_vg1x4", [IsStreaming, IsInOutZA], []>;
@@ -559,8 +559,8 @@ let SMETargetGuard = "sme-f16f16" in {
}
let SMETargetGuard = "sme-b16b16" in {
- defm SVBMOP4A_H : MOP4<"a", "za16", "bf", "aarch64_sme_mop4a", "">;
- defm SVBMOP4S_H : MOP4<"s", "za16", "bf", "aarch64_sme_mop4s", "">;
+ defm SVBMOP4A_H : MOP4<"a", "za16", "bf", "aarch64_sme_mop4a", "", [ImmCheck<0, ImmCheck0_1>]>;
+ defm SVBMOP4S_H : MOP4<"s", "za16", "bf", "aarch64_sme_mop4s", "", [ImmCheck<0, ImmCheck0_1>]>;
def SVMLA_MULTI_VG1x2_BF16 : Inst<"svmla_za16[_bf16]_vg1x2", "vm22", "b", MergeNone, "aarch64_sme_fmla_vg1x2", [IsStreaming, IsInOutZA], []>;
def SVMLA_MULTI_VG1x4_BF16 : Inst<"svmla_za16[_bf16]_vg1x4", "vm44", "b", MergeNone, "aarch64_sme_fmla_vg1x4", [IsStreaming, IsInOutZA], []>;
diff --git a/clang/test/CodeGen/AArch64/sme2-intrinsics/acle_sme2_mop4_1x1.c b/clang/test/CodeGen/AArch64/sme2-intrinsics/acle_sme2_mop4_1x1.c
index 34a9633374d3f..521f7900bacd2 100644
--- a/clang/test/CodeGen/AArch64/sme2-intrinsics/acle_sme2_mop4_1x1.c
+++ b/clang/test/CodeGen/AArch64/sme2-intrinsics/acle_sme2_mop4_1x1.c
@@ -18,448 +18,448 @@
// CHECK-LABEL: @test_svmop4a_1x1_za32_s8_s8(
// CHECK-NEXT: entry:
-// CHECK-NEXT: tail call void @llvm.aarch64.sme.smop4a.wide.1x1.nxv16i8(i32 3, <vscale x 16 x i8> [[ZN:%.*]], <vscale x 16 x i8> [[ZM:%.*]])
+// CHECK-NEXT: tail call void @llvm.aarch64.sme.smop4a.wide.1x1.nxv16i8(i32 1, <vscale x 16 x i8> [[ZN:%.*]], <vscale x 16 x i8> [[ZM:%.*]])
// CHECK-NEXT: ret void
//
// CPP-CHECK-LABEL: @_Z27test_svmop4a_1x1_za32_s8_s8u10__SVInt8_tS_(
// CPP-CHECK-NEXT: entry:
-// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.smop4a.wide.1x1.nxv16i8(i32 3, <vscale x 16 x i8> [[ZN:%.*]], <vscale x 16 x i8> [[ZM:%.*]])
+// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.smop4a.wide.1x1.nxv16i8(i32 1, <vscale x 16 x i8> [[ZN:%.*]], <vscale x 16 x i8> [[ZM:%.*]])
// CPP-CHECK-NEXT: ret void
//
void test_svmop4a_1x1_za32_s8_s8(svint8_t zn, svint8_t zm) __arm_streaming __arm_inout("za") {
- SME_ACLE_FUNC(svmop4a_1x1_za32,_s8_s8,)(3, zn, zm);
+ SME_ACLE_FUNC(svmop4a_1x1_za32,_s8_s8,)(1, zn, zm);
}
// CHECK-LABEL: @test_svmop4s_1x1_za32_s8_s8(
// CHECK-NEXT: entry:
-// CHECK-NEXT: tail call void @llvm.aarch64.sme.smop4s.wide.1x1.nxv16i8(i32 3, <vscale x 16 x i8> [[ZN:%.*]], <vscale x 16 x i8> [[ZM:%.*]])
+// CHECK-NEXT: tail call void @llvm.aarch64.sme.smop4s.wide.1x1.nxv16i8(i32 1, <vscale x 16 x i8> [[ZN:%.*]], <vscale x 16 x i8> [[ZM:%.*]])
// CHECK-NEXT: ret void
//
// CPP-CHECK-LABEL: @_Z27test_svmop4s_1x1_za32_s8_s8u10__SVInt8_tS_(
// CPP-CHECK-NEXT: entry:
-// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.smop4s.wide.1x1.nxv16i8(i32 3, <vscale x 16 x i8> [[ZN:%.*]], <vscale x 16 x i8> [[ZM:%.*]])
+// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.smop4s.wide.1x1.nxv16i8(i32 1, <vscale x 16 x i8> [[ZN:%.*]], <vscale x 16 x i8> [[ZM:%.*]])
// CPP-CHECK-NEXT: ret void
//
void test_svmop4s_1x1_za32_s8_s8(svint8_t zn, svint8_t zm) __arm_streaming __arm_inout("za") {
- SME_ACLE_FUNC(svmop4s_1x1_za32,_s8_s8,)(3, zn, zm);
+ SME_ACLE_FUNC(svmop4s_1x1_za32,_s8_s8,)(1, zn, zm);
}
// CHECK-LABEL: @test_svmop4a_1x1_za32_u8_u8(
// CHECK-NEXT: entry:
-// CHECK-NEXT: tail call void @llvm.aarch64.sme.umop4a.wide.1x1.nxv16i8(i32 3, <vscale x 16 x i8> [[ZN:%.*]], <vscale x 16 x i8> [[ZM:%.*]])
+// CHECK-NEXT: tail call void @llvm.aarch64.sme.umop4a.wide.1x1.nxv16i8(i32 1, <vscale x 16 x i8> [[ZN:%.*]], <vscale x 16 x i8> [[ZM:%.*]])
// CHECK-NEXT: ret void
//
// CPP-CHECK-LABEL: @_Z27test_svmop4a_1x1_za32_u8_u8u11__SVUint8_tS_(
// CPP-CHECK-NEXT: entry:
-// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.umop4a.wide.1x1.nxv16i8(i32 3, <vscale x 16 x i8> [[ZN:%.*]], <vscale x 16 x i8> [[ZM:%.*]])
+// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.umop4a.wide.1x1.nxv16i8(i32 1, <vscale x 16 x i8> [[ZN:%.*]], <vscale x 16 x i8> [[ZM:%.*]])
// CPP-CHECK-NEXT: ret void
//
void test_svmop4a_1x1_za32_u8_u8(svuint8_t zn, svuint8_t zm) __arm_streaming __arm_inout("za") {
- SME_ACLE_FUNC(svmop4a_1x1_za32,_u8_u8,)(3, zn, zm);
+ SME_ACLE_FUNC(svmop4a_1x1_za32,_u8_u8,)(1, zn, zm);
}
// CHECK-LABEL: @test_svmop4s_1x1_za32_u8_u8(
// CHECK-NEXT: entry:
-// CHECK-NEXT: tail call void @llvm.aarch64.sme.umop4s.wide.1x1.nxv16i8(i32 3, <vscale x 16 x i8> [[ZN:%.*]], <vscale x 16 x i8> [[ZM:%.*]])
+// CHECK-NEXT: tail call void @llvm.aarch64.sme.umop4s.wide.1x1.nxv16i8(i32 1, <vscale x 16 x i8> [[ZN:%.*]], <vscale x 16 x i8> [[ZM:%.*]])
// CHECK-NEXT: ret void
//
// CPP-CHECK-LABEL: @_Z27test_svmop4s_1x1_za32_u8_u8u11__SVUint8_tS_(
// CPP-CHECK-NEXT: entry:
-// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.umop4s.wide.1x1.nxv16i8(i32 3, <vscale x 16 x i8> [[ZN:%.*]], <vscale x 16 x i8> [[ZM:%.*]])
+// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.umop4s.wide.1x1.nxv16i8(i32 1, <vscale x 16 x i8> [[ZN:%.*]], <vscale x 16 x i8> [[ZM:%.*]])
// CPP-CHECK-NEXT: ret void
//
void test_svmop4s_1x1_za32_u8_u8(svuint8_t zn, svuint8_t zm) __arm_streaming __arm_inout("za") {
- SME_ACLE_FUNC(svmop4s_1x1_za32,_u8_u8,)(3, zn, zm);
+ SME_ACLE_FUNC(svmop4s_1x1_za32,_u8_u8,)(1, zn, zm);
}
// CHECK-LABEL: @test_svmop4a_1x1_za32_s8_u8(
// CHECK-NEXT: entry:
-// CHECK-NEXT: tail call void @llvm.aarch64.sme.sumop4a.wide.1x1.nxv16i8(i32 3, <vscale x 16 x i8> [[ZN:%.*]], <vscale x 16 x i8> [[ZM:%.*]])
+// CHECK-NEXT: tail call void @llvm.aarch64.sme.sumop4a.wide.1x1.nxv16i8(i32 1, <vscale x 16 x i8> [[ZN:%.*]], <vscale x 16 x i8> [[ZM:%.*]])
// CHECK-NEXT: ret void
//
// CPP-CHECK-LABEL: @_Z27test_svmop4a_1x1_za32_s8_u8u10__SVInt8_tu11__SVUint8_t(
// CPP-CHECK-NEXT: entry:
-// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.sumop4a.wide.1x1.nxv16i8(i32 3, <vscale x 16 x i8> [[ZN:%.*]], <vscale x 16 x i8> [[ZM:%.*]])
+// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.sumop4a.wide.1x1.nxv16i8(i32 1, <vscale x 16 x i8> [[ZN:%.*]], <vscale x 16 x i8> [[ZM:%.*]])
// CPP-CHECK-NEXT: ret void
//
void test_svmop4a_1x1_za32_s8_u8(svint8_t zn, svuint8_t zm) __arm_streaming __arm_inout("za") {
- SME_ACLE_FUNC(svmop4a_1x1_za32,_s8_u8,)(3, zn, zm);
+ SME_ACLE_FUNC(svmop4a_1x1_za32,_s8_u8,)(1, zn, zm);
}
// CHECK-LABEL: @test_svmop4s_1x1_za32_s8_u8(
// CHECK-NEXT: entry:
-// CHECK-NEXT: tail call void @llvm.aarch64.sme.sumop4s.wide.1x1.nxv16i8(i32 3, <vscale x 16 x i8> [[ZN:%.*]], <vscale x 16 x i8> [[ZM:%.*]])
+// CHECK-NEXT: tail call void @llvm.aarch64.sme.sumop4s.wide.1x1.nxv16i8(i32 1, <vscale x 16 x i8> [[ZN:%.*]], <vscale x 16 x i8> [[ZM:%.*]])
// CHECK-NEXT: ret void
//
// CPP-CHECK-LABEL: @_Z27test_svmop4s_1x1_za32_s8_u8u10__SVInt8_tu11__SVUint8_t(
// CPP-CHECK-NEXT: entry:
-// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.sumop4s.wide.1x1.nxv16i8(i32 3, <vscale x 16 x i8> [[ZN:%.*]], <vscale x 16 x i8> [[ZM:%.*]])
+// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.sumop4s.wide.1x1.nxv16i8(i32 1, <vscale x 16 x i8> [[ZN:%.*]], <vscale x 16 x i8> [[ZM:%.*]])
// CPP-CHECK-NEXT: ret void
//
void test_svmop4s_1x1_za32_s8_u8(svint8_t zn, svuint8_t zm) __arm_streaming __arm_inout("za") {
- SME_ACLE_FUNC(svmop4s_1x1_za32,_s8_u8,)(3, zn, zm);
+ SME_ACLE_FUNC(svmop4s_1x1_za32,_s8_u8,)(1, zn, zm);
}
// CHECK-LABEL: @test_svmop4a_1x1_za32_u8_s8(
// CHECK-NEXT: entry:
-// CHECK-NEXT: tail call void @llvm.aarch64.sme.usmop4a.wide.1x1.nxv16i8(i32 3, <vscale x 16 x i8> [[ZN:%.*]], <vscale x 16 x i8> [[ZM:%.*]])
+// CHECK-NEXT: tail call void @llvm.aarch64.sme.usmop4a.wide.1x1.nxv16i8(i32 1, <vscale x 16 x i8> [[ZN:%.*]], <vscale x 16 x i8> [[ZM:%.*]])
// CHECK-NEXT: ret void
//
// CPP-CHECK-LABEL: @_Z27test_svmop4a_1x1_za32_u8_s8u11__SVUint8_tu10__SVInt8_t(
// CPP-CHECK-NEXT: entry:
-// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.usmop4a.wide.1x1.nxv16i8(i32 3, <vscale x 16 x i8> [[ZN:%.*]], <vscale x 16 x i8> [[ZM:%.*]])
+// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.usmop4a.wide.1x1.nxv16i8(i32 1, <vscale x 16 x i8> [[ZN:%.*]], <vscale x 16 x i8> [[ZM:%.*]])
// CPP-CHECK-NEXT: ret void
//
void test_svmop4a_1x1_za32_u8_s8(svuint8_t zn, svint8_t zm) __arm_streaming __arm_inout("za") {
- SME_ACLE_FUNC(svmop4a_1x1_za32,_u8_s8,)(3, zn, zm);
+ SME_ACLE_FUNC(svmop4a_1x1_za32,_u8_s8,)(1, zn, zm);
}
// CHECK-LABEL: @test_svmop4s_1x1_za32_u8_s8(
// CHECK-NEXT: entry:
-// CHECK-NEXT: tail call void @llvm.aarch64.sme.usmop4s.wide.1x1.nxv16i8(i32 3, <vscale x 16 x i8> [[ZN:%.*]], <vscale x 16 x i8> [[ZM:%.*]])
+// CHECK-NEXT: tail call void @llvm.aarch64.sme.usmop4s.wide.1x1.nxv16i8(i32 1, <vscale x 16 x i8> [[ZN:%.*]], <vscale x 16 x i8> [[ZM:%.*]])
// CHECK-NEXT: ret void
//
// CPP-CHECK-LABEL: @_Z27test_svmop4s_1x1_za32_u8_s8u11__SVUint8_tu10__SVInt8_t(
// CPP-CHECK-NEXT: entry:
-// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.usmop4s.wide.1x1.nxv16i8(i32 3, <vscale x 16 x i8> [[ZN:%.*]], <vscale x 16 x i8> [[ZM:%.*]])
+// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.usmop4s.wide.1x1.nxv16i8(i32 1, <vscale x 16 x i8> [[ZN:%.*]], <vscale x 16 x i8> [[ZM:%.*]])
// CPP-CHECK-NEXT: ret void
//
void test_svmop4s_1x1_za32_u8_s8(svuint8_t zn, svint8_t zm) __arm_streaming __arm_inout("za") {
- SME_ACLE_FUNC(svmop4s_1x1_za32,_u8_s8,)(3, zn, zm);
+ SME_ACLE_FUNC(svmop4s_1x1_za32,_u8_s8,)(1, zn, zm);
}
// CHECK-LABEL: @test_svmop4a_1x1_za32_s16_s16(
// CHECK-NEXT: entry:
-// CHECK-NEXT: tail call void @llvm.aarch64.sme.smop4a.wide.1x1.nxv8i16(i32 3, <vscale x 8 x i16> [[ZN:%.*]], <vscale x 8 x i16> [[ZM:%.*]])
+// CHECK-NEXT: tail call void @llvm.aarch64.sme.smop4a.wide.1x1.nxv8i16(i32 1, <vscale x 8 x i16> [[ZN:%.*]], <vscale x 8 x i16> [[ZM:%.*]])
// CHECK-NEXT: ret void
//
// CPP-CHECK-LABEL: @_Z29test_svmop4a_1x1_za32_s16_s16u11__SVInt16_tS_(
// CPP-CHECK-NEXT: entry:
-// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.smop4a.wide.1x1.nxv8i16(i32 3, <vscale x 8 x i16> [[ZN:%.*]], <vscale x 8 x i16> [[ZM:%.*]])
+// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.smop4a.wide.1x1.nxv8i16(i32 1, <vscale x 8 x i16> [[ZN:%.*]], <vscale x 8 x i16> [[ZM:%.*]])
// CPP-CHECK-NEXT: ret void
//
void test_svmop4a_1x1_za32_s16_s16(svint16_t zn, svint16_t zm) __arm_streaming __arm_inout("za") {
- SME_ACLE_FUNC(svmop4a_1x1_za32,_s16_s16,)(3, zn, zm);
+ SME_ACLE_FUNC(svmop4a_1x1_za32,_s16_s16,)(1, zn, zm);
}
// CHECK-LABEL: @test_svmop4s_1x1_za32_s16_s16(
// CHECK-NEXT: entry:
-// CHECK-NEXT: tail call void @llvm.aarch64.sme.smop4s.wide.1x1.nxv8i16(i32 3, <vscale x 8 x i16> [[ZN:%.*]], <vscale x 8 x i16> [[ZM:%.*]])
+// CHECK-NEXT: tail call void @llvm.aarch64.sme.smop4s.wide.1x1.nxv8i16(i32 1, <vscale x 8 x i16> [[ZN:%.*]], <vscale x 8 x i16> [[ZM:%.*]])
// CHECK-NEXT: ret void
//
// CPP-CHECK-LABEL: @_Z29test_svmop4s_1x1_za32_s16_s16u11__SVInt16_tS_(
// CPP-CHECK-NEXT: entry:
-// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.smop4s.wide.1x1.nxv8i16(i32 3, <vscale x 8 x i16> [[ZN:%.*]], <vscale x 8 x i16> [[ZM:%.*]])
+// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.smop4s.wide.1x1.nxv8i16(i32 1, <vscale x 8 x i16> [[ZN:%.*]], <vscale x 8 x i16> [[ZM:%.*]])
// CPP-CHECK-NEXT: ret void
//
void test_svmop4s_1x1_za32_s16_s16(svint16_t zn, svint16_t zm) __arm_streaming __arm_inout("za") {
- SME_ACLE_FUNC(svmop4s_1x1_za32,_s16_s16,)(3, zn, zm);
+ SME_ACLE_FUNC(svmop4s_1x1_za32,_s16_s16,)(1, zn, zm);
}
// CHECK-LABEL: @test_svmop4a_1x1_za32_u16_u16(
// CHECK-NEXT: entry:
-// CHECK-NEXT: tail call void @llvm.aarch64.sme.umop4a.wide.1x1.nxv8i16(i32 3, <vscale x 8 x i16> [[ZN:%.*]], <vscale x 8 x i16> [[ZM:%.*]])
+// CHECK-NEXT: tail call void @llvm.aarch64.sme.umop4a.wide.1x1.nxv8i16(i32 1, <vscale x 8 x i16> [[ZN:%.*]], <vscale x 8 x i16> [[ZM:%.*]])
// CHECK-NEXT: ret void
//
// CPP-CHECK-LABEL: @_Z29test_svmop4a_1x1_za32_u16_u16u12__SVUint16_tS_(
// CPP-CHECK-NEXT: entry:
-// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.umop4a.wide.1x1.nxv8i16(i32 3, <vscale x 8 x i16> [[ZN:%.*]], <vscale x 8 x i16> [[ZM:%.*]])
+// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.umop4a.wide.1x1.nxv8i16(i32 1, <vscale x 8 x i16> [[ZN:%.*]], <vscale x 8 x i16> [[ZM:%.*]])
// CPP-CHECK-NEXT: ret void
//
void test_svmop4a_1x1_za32_u16_u16(svuint16_t zn, svuint16_t zm) __arm_streaming __arm_inout("za") {
- SME_ACLE_FUNC(svmop4a_1x1_za32,_u16_u16,)(3, zn, zm);
+ SME_ACLE_FUNC(svmop4a_1x1_za32,_u16_u16,)(1, zn, zm);
}
// CHECK-LABEL: @test_svmop4s_1x1_za32_u16_u16(
// CHECK-NEXT: entry:
-// CHECK-NEXT: tail call void @llvm.aarch64.sme.umop4s.wide.1x1.nxv8i16(i32 3, <vscale x 8 x i16> [[ZN:%.*]], <vscale x 8 x i16> [[ZM:%.*]])
+// CHECK-NEXT: tail call void @llvm.aarch64.sme.umop4s.wide.1x1.nxv8i16(i32 1, <vscale x 8 x i16> [[ZN:%.*]], <vscale x 8 x i16> [[ZM:%.*]])
// CHECK-NEXT: ret void
//
// CPP-CHECK-LABEL: @_Z29test_svmop4s_1x1_za32_u16_u16u12__SVUint16_tS_(
// CPP-CHECK-NEXT: entry:
-// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.umop4s.wide.1x1.nxv8i16(i32 3, <vscale x 8 x i16> [[ZN:%.*]], <vscale x 8 x i16> [[ZM:%.*]])
+// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.umop4s.wide.1x1.nxv8i16(i32 1, <vscale x 8 x i16> [[ZN:%.*]], <vscale x 8 x i16> [[ZM:%.*]])
// CPP-CHECK-NEXT: ret void
//
void test_svmop4s_1x1_za32_u16_u16(svuint16_t zn, svuint16_t zm) __arm_streaming __arm_inout("za") {
- SME_ACLE_FUNC(svmop4s_1x1_za32,_u16_u16,)(3, zn, zm);
+ SME_ACLE_FUNC(svmop4s_1x1_za32,_u16_u16,)(1, zn, zm);
}
// CHECK-LABEL: @test_svmop4a_1x1_za32_f16_f16(
// CHECK-NEXT: entry:
-// CHECK-NEXT: tail call void @llvm.aarch64.sme.mop4a.wide.1x1.nxv8f16(i32 3, <vscale x 8 x half> [[ZN:%.*]], <vscale x 8 x half> [[ZM:%.*]])
+// CHECK-NEXT: tail call void @llvm.aarch64.sme.mop4a.wide.1x1.nxv8f16(i32 1, <vscale x 8 x half> [[ZN:%.*]], <vscale x 8 x half> [[ZM:%.*]])
// CHECK-NEXT: ret void
//
// CPP-CHECK-LABEL: @_Z29test_svmop4a_1x1_za32_f16_f16u13__SVFloat16_tS_(
// CPP-CHECK-NEXT: entry:
-// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.mop4a.wide.1x1.nxv8f16(i32 3, <vscale x 8 x half> [[ZN:%.*]], <vscale x 8 x half> [[ZM:%.*]])
+// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.mop4a.wide.1x1.nxv8f16(i32 1, <vscale x 8 x half> [[ZN:%.*]], <vscale x 8 x half> [[ZM:%.*]])
// CPP-CHECK-NEXT: ret void
//
void test_svmop4a_1x1_za32_f16_f16(svfloat16_t zn, svfloat16_t zm) __arm_streaming __arm_inout("za") {
- SME_ACLE_FUNC(svmop4a_1x1_za32,_f16_f16,)(3, zn, zm);
+ SME_ACLE_FUNC(svmop4a_1x1_za32,_f16_f16,)(1, zn, zm);
}
// CHECK-LABEL: @test_svmop4s_1x1_za32_f16_f16(
// CHECK-NEXT: entry:
-// CHECK-NEXT: tail call void @llvm.aarch64.sme.mop4s.wide.1x1.nxv8f16(i32 3, <vscale x 8 x half> [[ZN:%.*]], <vscale x 8 x half> [[ZM:%.*]])
+// CHECK-NEXT: tail call void @llvm.aarch64.sme.mop4s.wide.1x1.nxv8f16(i32 1, <vscale x 8 x half> [[ZN:%.*]], <vscale x 8 x half> [[ZM:%.*]])
// CHECK-NEXT: ret void
//
// CPP-CHECK-LABEL: @_Z29test_svmop4s_1x1_za32_f16_f16u13__SVFloat16_tS_(
// CPP-CHECK-NEXT: entry:
-// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.mop4s.wide.1x1.nxv8f16(i32 3, <vscale x 8 x half> [[ZN:%.*]], <vscale x 8 x half> [[ZM:%.*]])
+// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.mop4s.wide.1x1.nxv8f16(i32 1, <vscale x 8 x half> [[ZN:%.*]], <vscale x 8 x half> [[ZM:%.*]])
// CPP-CHECK-NEXT: ret void
//
void test_svmop4s_1x1_za32_f16_f16(svfloat16_t zn, svfloat16_t zm) __arm_streaming __arm_inout("za") {
- SME_ACLE_FUNC(svmop4s_1x1_za32,_f16_f16,)(3, zn, zm);
+ SME_ACLE_FUNC(svmop4s_1x1_za32,_f16_f16,)(1, zn, zm);
}
// CHECK-LABEL: @test_svmop4a_1x1_za32_bf16_bf16(
// CHECK-NEXT: entry:
-// CHECK-NEXT: tail call void @llvm.aarch64.sme.mop4a.wide.1x1.nxv8bf16(i32 3, <vscale x 8 x bfloat> [[ZN:%.*]], <vscale x 8 x bfloat> [[ZM:%.*]])
+// CHECK-NEXT: tail call void @llvm.aarch64.sme.mop4a.wide.1x1.nxv8bf16(i32 1, <vscale x 8 x bfloat> [[ZN:%.*]], <vscale x 8 x bfloat> [[ZM:%.*]])
// CHECK-NEXT: ret void
//
// CPP-CHECK-LABEL: @_Z31test_svmop4a_1x1_za32_bf16_bf16u14__SVBfloat16_tS_(
// CPP-CHECK-NEXT: entry:
-// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.mop4a.wide.1x1.nxv8bf16(i32 3, <vscale x 8 x bfloat> [[ZN:%.*]], <vscale x 8 x bfloat> [[ZM:%.*]])
+// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.mop4a.wide.1x1.nxv8bf16(i32 1, <vscale x 8 x bfloat> [[ZN:%.*]], <vscale x 8 x bfloat> [[ZM:%.*]])
// CPP-CHECK-NEXT: ret void
//
void test_svmop4a_1x1_za32_bf16_bf16(svbfloat16_t zn, svbfloat16_t zm) __arm_streaming __arm_inout("za") {
- SME_ACLE_FUNC(svmop4a_1x1_za32,_bf16_bf16,)(3, zn, zm);
+ SME_ACLE_FUNC(svmop4a_1x1_za32,_bf16_bf16,)(1, zn, zm);
}
// CHECK-LABEL: @test_svmop4s_1x1_za32_bf16_bf16(
// CHECK-NEXT: entry:
-// CHECK-NEXT: tail call void @llvm.aarch64.sme.mop4s.wide.1x1.nxv8bf16(i32 3, <vscale x 8 x bfloat> [[ZN:%.*]], <vscale x 8 x bfloat> [[ZM:%.*]])
+// CHECK-NEXT: tail call void @llvm.aarch64.sme.mop4s.wide.1x1.nxv8bf16(i32 1, <vscale x 8 x bfloat> [[ZN:%.*]], <vscale x 8 x bfloat> [[ZM:%.*]])
// CHECK-NEXT: ret void
//
// CPP-CHECK-LABEL: @_Z31test_svmop4s_1x1_za32_bf16_bf16u14__SVBfloat16_tS_(
// CPP-CHECK-NEXT: entry:
-// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.mop4s.wide.1x1.nxv8bf16(i32 3, <vscale x 8 x bfloat> [[ZN:%.*]], <vscale x 8 x bfloat> [[ZM:%.*]])
+// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.mop4s.wide.1x1.nxv8bf16(i32 1, <vscale x 8 x bfloat> [[ZN:%.*]], <vscale x 8 x bfloat> [[ZM:%.*]])
// CPP-CHECK-NEXT: ret void
//
void test_svmop4s_1x1_za32_bf16_bf16(svbfloat16_t zn, svbfloat16_t zm) __arm_streaming __arm_inout("za") {
- SME_ACLE_FUNC(svmop4s_1x1_za32,_bf16_bf16,)(3, zn, zm);
+ SME_ACLE_FUNC(svmop4s_1x1_za32,_bf16_bf16,)(1, zn, zm);
}
// CHECK-LABEL: @test_svmop4a_1x1_za64_s16_s16(
// CHECK-NEXT: entry:
-// CHECK-NEXT: tail call void @llvm.aarch64.sme.smop4a.za64.wide.1x1.nxv8i16(i32 3, <vscale x 8 x i16> [[ZN:%.*]], <vscale x 8 x i16> [[ZM:%.*]])
+// CHECK-NEXT: tail call void @llvm.aarch64.sme.smop4a.za64.wide.1x1.nxv8i16(i32 1, <vscale x 8 x i16> [[ZN:%.*]], <vscale x 8 x i16> [[ZM:%.*]])
// CHECK-NEXT: ret void
//
// CPP-CHECK-LABEL: @_Z29test_svmop4a_1x1_za64_s16_s16u11__SVInt16_tS_(
// CPP-CHECK-NEXT: entry:
-// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.smop4a.za64.wide.1x1.nxv8i16(i32 3, <vscale x 8 x i16> [[ZN:%.*]], <vscale x 8 x i16> [[ZM:%.*]])
+// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.smop4a.za64.wide.1x1.nxv8i16(i32 1, <vscale x 8 x i16> [[ZN:%.*]], <vscale x 8 x i16> [[ZM:%.*]])
// CPP-CHECK-NEXT: ret void
//
void test_svmop4a_1x1_za64_s16_s16(svint16_t zn, svint16_t zm) __arm_streaming __arm_inout("za") {
- SME_ACLE_FUNC(svmop4a_1x1_za64,_s16_s16,)(3, zn, zm);
+ SME_ACLE_FUNC(svmop4a_1x1_za64,_s16_s16,)(1, zn, zm);
}
// CHECK-LABEL: @test_svmop4s_1x1_za64_s16_s16(
// CHECK-NEXT: entry:
-// CHECK-NEXT: tail call void @llvm.aarch64.sme.smop4s.za64.wide.1x1.nxv8i16(i32 3, <vscale x 8 x i16> [[ZN:%.*]], <vscale x 8 x i16> [[ZM:%.*]])
+// CHECK-NEXT: tail call void @llvm.aarch64.sme.smop4s.za64.wide.1x1.nxv8i16(i32 1, <vscale x 8 x i16> [[ZN:%.*]], <vscale x 8 x i16> [[ZM:%.*]])
// CHECK-NEXT: ret void
//
// CPP-CHECK-LABEL: @_Z29test_svmop4s_1x1_za64_s16_s16u11__SVInt16_tS_(
// CPP-CHECK-NEXT: entry:
-// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.smop4s.za64.wide.1x1.nxv8i16(i32 3, <vscale x 8 x i16> [[ZN:%.*]], <vscale x 8 x i16> [[ZM:%.*]])
+// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.smop4s.za64.wide.1x1.nxv8i16(i32 1, <vscale x 8 x i16> [[ZN:%.*]], <vscale x 8 x i16> [[ZM:%.*]])
// CPP-CHECK-NEXT: ret void
//
void test_svmop4s_1x1_za64_s16_s16(svint16_t zn, svint16_t zm) __arm_streaming __arm_inout("za") {
- SME_ACLE_FUNC(svmop4s_1x1_za64,_s16_s16,)(3, zn, zm);
+ SME_ACLE_FUNC(svmop4s_1x1_za64,_s16_s16,)(1, zn, zm);
}
// CHECK-LABEL: @test_svmop4a_1x1_za64_u16_u16(
// CHECK-NEXT: entry:
-// CHECK-NEXT: tail call void @llvm.aarch64.sme.umop4a.za64.wide.1x1.nxv8i16(i32 3, <vscale x 8 x i16> [[ZN:%.*]], <vscale x 8 x i16> [[ZM:%.*]])
+// CHECK-NEXT: tail call void @llvm.aarch64.sme.umop4a.za64.wide.1x1.nxv8i16(i32 1, <vscale x 8 x i16> [[ZN:%.*]], <vscale x 8 x i16> [[ZM:%.*]])
// CHECK-NEXT: ret void
//
// CPP-CHECK-LABEL: @_Z29test_svmop4a_1x1_za64_u16_u16u12__SVUint16_tS_(
// CPP-CHECK-NEXT: entry:
-// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.umop4a.za64.wide.1x1.nxv8i16(i32 3, <vscale x 8 x i16> [[ZN:%.*]], <vscale x 8 x i16> [[ZM:%.*]])
+// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.umop4a.za64.wide.1x1.nxv8i16(i32 1, <vscale x 8 x i16> [[ZN:%.*]], <vscale x 8 x i16> [[ZM:%.*]])
// CPP-CHECK-NEXT: ret void
//
void test_svmop4a_1x1_za64_u16_u16(svuint16_t zn, svuint16_t zm) __arm_streaming __arm_inout("za") {
- SME_ACLE_FUNC(svmop4a_1x1_za64,_u16_u16,)(3, zn, zm);
+ SME_ACLE_FUNC(svmop4a_1x1_za64,_u16_u16,)(1, zn, zm);
}
// CHECK-LABEL: @test_svmop4s_1x1_za64_u16_u16(
// CHECK-NEXT: entry:
-// CHECK-NEXT: tail call void @llvm.aarch64.sme.umop4s.za64.wide.1x1.nxv8i16(i32 3, <vscale x 8 x i16> [[ZN:%.*]], <vscale x 8 x i16> [[ZM:%.*]])
+// CHECK-NEXT: tail call void @llvm.aarch64.sme.umop4s.za64.wide.1x1.nxv8i16(i32 1, <vscale x 8 x i16> [[ZN:%.*]], <vscale x 8 x i16> [[ZM:%.*]])
// CHECK-NEXT: ret void
//
// CPP-CHECK-LABEL: @_Z29test_svmop4s_1x1_za64_u16_u16u12__SVUint16_tS_(
// CPP-CHECK-NEXT: entry:
-// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.umop4s.za64.wide.1x1.nxv8i16(i32 3, <vscale x 8 x i16> [[ZN:%.*]], <vscale x 8 x i16> [[ZM:%.*]])
+// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.umop4s.za64.wide.1x1.nxv8i16(i32 1, <vscale x 8 x i16> [[ZN:%.*]], <vscale x 8 x i16> [[ZM:%.*]])
// CPP-CHECK-NEXT: ret void
//
void test_svmop4s_1x1_za64_u16_u16(svuint16_t zn, svuint16_t zm) __arm_streaming __arm_inout("za") {
- SME_ACLE_FUNC(svmop4s_1x1_za64,_u16_u16,)(3, zn, zm);
+ SME_ACLE_FUNC(svmop4s_1x1_za64,_u16_u16,)(1, zn, zm);
}
// CHECK-LABEL: @test_svmop4a_1x1_za64_s16_u16(
// CHECK-NEXT: entry:
-// CHECK-NEXT: tail call void @llvm.aarch64.sme.sumop4a.za64.wide.1x1.nxv8i16(i32 3, <vscale x 8 x i16> [[ZN:%.*]], <vscale x 8 x i16> [[ZM:%.*]])
+// CHECK-NEXT: tail call void @llvm.aarch64.sme.sumop4a.za64.wide.1x1.nxv8i16(i32 1, <vscale x 8 x i16> [[ZN:%.*]], <vscale x 8 x i16> [[ZM:%.*]])
// CHECK-NEXT: ret void
//
// CPP-CHECK-LABEL: @_Z29test_svmop4a_1x1_za64_s16_u16u11__SVInt16_tu12__SVUint16_t(
// CPP-CHECK-NEXT: entry:
-// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.sumop4a.za64.wide.1x1.nxv8i16(i32 3, <vscale x 8 x i16> [[ZN:%.*]], <vscale x 8 x i16> [[ZM:%.*]])
+// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.sumop4a.za64.wide.1x1.nxv8i16(i32 1, <vscale x 8 x i16> [[ZN:%.*]], <vscale x 8 x i16> [[ZM:%.*]])
// CPP-CHECK-NEXT: ret void
//
void test_svmop4a_1x1_za64_s16_u16(svint16_t zn, svuint16_t zm) __arm_streaming __arm_inout("za") {
- SME_ACLE_FUNC(svmop4a_1x1_za64,_s16_u16,)(3, zn, zm);
+ SME_ACLE_FUNC(svmop4a_1x1_za64,_s16_u16,)(1, zn, zm);
}
// CHECK-LABEL: @test_svmop4s_1x1_za64_s16_u16(
// CHECK-NEXT: entry:
-// CHECK-NEXT: tail call void @llvm.aarch64.sme.sumop4s.za64.wide.1x1.nxv8i16(i32 3, <vscale x 8 x i16> [[ZN:%.*]], <vscale x 8 x i16> [[ZM:%.*]])
+// CHECK-NEXT: tail call void @llvm.aarch64.sme.sumop4s.za64.wide.1x1.nxv8i16(i32 1, <vscale x 8 x i16> [[ZN:%.*]], <vscale x 8 x i16> [[ZM:%.*]])
// CHECK-NEXT: ret void
//
// CPP-CHECK-LABEL: @_Z29test_svmop4s_1x1_za64_s16_u16u11__SVInt16_tu12__SVUint16_t(
// CPP-CHECK-NEXT: entry:
-// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.sumop4s.za64.wide.1x1.nxv8i16(i32 3, <vscale x 8 x i16> [[ZN:%.*]], <vscale x 8 x i16> [[ZM:%.*]])
+// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.sumop4s.za64.wide.1x1.nxv8i16(i32 1, <vscale x 8 x i16> [[ZN:%.*]], <vscale x 8 x i16> [[ZM:%.*]])
// CPP-CHECK-NEXT: ret void
//
void test_svmop4s_1x1_za64_s16_u16(svint16_t zn, svuint16_t zm) __arm_streaming __arm_inout("za") {
- SME_ACLE_FUNC(svmop4s_1x1_za64,_s16_u16,)(3, zn, zm);
+ SME_ACLE_FUNC(svmop4s_1x1_za64,_s16_u16,)(1, zn, zm);
}
// CHECK-LABEL: @test_svmop4a_1x1_za64_u16_s16(
// CHECK-NEXT: entry:
-// CHECK-NEXT: tail call void @llvm.aarch64.sme.usmop4a.za64.wide.1x1.nxv8i16(i32 3, <vscale x 8 x i16> [[ZN:%.*]], <vscale x 8 x i16> [[ZM:%.*]])
+// CHECK-NEXT: tail call void @llvm.aarch64.sme.usmop4a.za64.wide.1x1.nxv8i16(i32 1, <vscale x 8 x i16> [[ZN:%.*]], <vscale x 8 x i16> [[ZM:%.*]])
// CHECK-NEXT: ret void
//
// CPP-CHECK-LABEL: @_Z29test_svmop4a_1x1_za64_u16_s16u12__SVUint16_tu11__SVInt16_t(
// CPP-CHECK-NEXT: entry:
-// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.usmop4a.za64.wide.1x1.nxv8i16(i32 3, <vscale x 8 x i16> [[ZN:%.*]], <vscale x 8 x i16> [[ZM:%.*]])
+// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.usmop4a.za64.wide.1x1.nxv8i16(i32 1, <vscale x 8 x i16> [[ZN:%.*]], <vscale x 8 x i16> [[ZM:%.*]])
// CPP-CHECK-NEXT: ret void
//
void test_svmop4a_1x1_za64_u16_s16(svuint16_t zn, svint16_t zm) __arm_streaming __arm_inout("za") {
- SME_ACLE_FUNC(svmop4a_1x1_za64,_u16_s16,)(3, zn, zm);
+ SME_ACLE_FUNC(svmop4a_1x1_za64,_u16_s16,)(1, zn, zm);
}
// CHECK-LABEL: @test_svmop4s_1x1_za64_u16_s16(
// CHECK-NEXT: entry:
-// CHECK-NEXT: tail call void @llvm.aarch64.sme.usmop4s.za64.wide.1x1.nxv8i16(i32 3, <vscale x 8 x i16> [[ZN:%.*]], <vscale x 8 x i16> [[ZM:%.*]])
+// CHECK-NEXT: tail call void @llvm.aarch64.sme.usmop4s.za64.wide.1x1.nxv8i16(i32 1, <vscale x 8 x i16> [[ZN:%.*]], <vscale x 8 x i16> [[ZM:%.*]])
// CHECK-NEXT: ret void
//
// CPP-CHECK-LABEL: @_Z29test_svmop4s_1x1_za64_u16_s16u12__SVUint16_tu11__SVInt16_t(
// CPP-CHECK-NEXT: entry:
-// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.usmop4s.za64.wide.1x1.nxv8i16(i32 3, <vscale x 8 x i16> [[ZN:%.*]], <vscale x 8 x i16> [[ZM:%.*]])
+// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.usmop4s.za64.wide.1x1.nxv8i16(i32 1, <vscale x 8 x i16> [[ZN:%.*]], <vscale x 8 x i16> [[ZM:%.*]])
// CPP-CHECK-NEXT: ret void
//
void test_svmop4s_1x1_za64_u16_s16(svuint16_t zn, svint16_t zm) __arm_streaming __arm_inout("za") {
- SME_ACLE_FUNC(svmop4s_1x1_za64,_u16_s16,)(3, zn, zm);
+ SME_ACLE_FUNC(svmop4s_1x1_za64,_u16_s16,)(1, zn, zm);
}
// CHECK-LABEL: @test_svmop4a_1x1_za16_f16_f16(
// CHECK-NEXT: entry:
-// CHECK-NEXT: tail call void @llvm.aarch64.sme.mop4a.1x1.nxv8f16(i32 3, <vscale x 8 x half> [[ZN:%.*]], <vscale x 8 x half> [[ZM:%.*]])
+// CHECK-NEXT: tail call void @llvm.aarch64.sme.mop4a.1x1.nxv8f16(i32 1, <vscale x 8 x half> [[ZN:%.*]], <vscale x 8 x half> [[ZM:%.*]])
// CHECK-NEXT: ret void
//
// CPP-CHECK-LABEL: @_Z29test_svmop4a_1x1_za16_f16_f16u13__SVFloat16_tS_(
// CPP-CHECK-NEXT: entry:
-// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.mop4a.1x1.nxv8f16(i32 3, <vscale x 8 x half> [[ZN:%.*]], <vscale x 8 x half> [[ZM:%.*]])
+// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.mop4a.1x1.nxv8f16(i32 1, <vscale x 8 x half> [[ZN:%.*]], <vscale x 8 x half> [[ZM:%.*]])
// CPP-CHECK-NEXT: ret void
//
void test_svmop4a_1x1_za16_f16_f16(svfloat16_t zn, svfloat16_t zm) __arm_streaming __arm_inout("za") {
- SME_ACLE_FUNC(svmop4a_1x1_za16,_f16_f16,)(3, zn, zm);
+ SME_ACLE_FUNC(svmop4a_1x1_za16,_f16_f16,)(1, zn, zm);
}
// CHECK-LABEL: @test_svmop4s_1x1_za16_f16_f16(
// CHECK-NEXT: entry:
-// CHECK-NEXT: tail call void @llvm.aarch64.sme.mop4s.1x1.nxv8f16(i32 3, <vscale x 8 x half> [[ZN:%.*]], <vscale x 8 x half> [[ZM:%.*]])
+// CHECK-NEXT: tail call void @llvm.aarch64.sme.mop4s.1x1.nxv8f16(i32 1, <vscale x 8 x half> [[ZN:%.*]], <vscale x 8 x half> [[ZM:%.*]])
// CHECK-NEXT: ret void
//
// CPP-CHECK-LABEL: @_Z29test_svmop4s_1x1_za16_f16_f16u13__SVFloat16_tS_(
// CPP-CHECK-NEXT: entry:
-// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.mop4s.1x1.nxv8f16(i32 3, <vscale x 8 x half> [[ZN:%.*]], <vscale x 8 x half> [[ZM:%.*]])
+// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.mop4s.1x1.nxv8f16(i32 1, <vscale x 8 x half> [[ZN:%.*]], <vscale x 8 x half> [[ZM:%.*]])
// CPP-CHECK-NEXT: ret void
//
void test_svmop4s_1x1_za16_f16_f16(svfloat16_t zn, svfloat16_t zm) __arm_streaming __arm_inout("za") {
- SME_ACLE_FUNC(svmop4s_1x1_za16,_f16_f16,)(3, zn, zm);
+ SME_ACLE_FUNC(svmop4s_1x1_za16,_f16_f16,)(1, zn, zm);
}
// CHECK-LABEL: @test_svmop4a_1x1_za32_f32_f32(
// CHECK-NEXT: entry:
-// CHECK-NEXT: tail call void @llvm.aarch64.sme.mop4a.1x1.nxv4f32(i32 3, <vscale x 4 x float> [[ZN:%.*]], <vscale x 4 x float> [[ZM:%.*]])
+// CHECK-NEXT: tail call void @llvm.aarch64.sme.mop4a.1x1.nxv4f32(i32 1, <vscale x 4 x float> [[ZN:%.*]], <vscale x 4 x float> [[ZM:%.*]])
// CHECK-NEXT: ret void
//
// CPP-CHECK-LABEL: @_Z29test_svmop4a_1x1_za32_f32_f32u13__SVFloat32_tS_(
// CPP-CHECK-NEXT: entry:
-// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.mop4a.1x1.nxv4f32(i32 3, <vscale x 4 x float> [[ZN:%.*]], <vscale x 4 x float> [[ZM:%.*]])
+// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.mop4a.1x1.nxv4f32(i32 1, <vscale x 4 x float> [[ZN:%.*]], <vscale x 4 x float> [[ZM:%.*]])
// CPP-CHECK-NEXT: ret void
//
void test_svmop4a_1x1_za32_f32_f32(svfloat32_t zn, svfloat32_t zm) __arm_streaming __arm_inout("za") {
- SME_ACLE_FUNC(svmop4a_1x1_za32,_f32_f32,)(3, zn, zm);
+ SME_ACLE_FUNC(svmop4a_1x1_za32,_f32_f32,)(1, zn, zm);
}
// CHECK-LABEL: @test_svmop4s_1x1_za32_f32_f32(
// CHECK-NEXT: entry:
-// CHECK-NEXT: tail call void @llvm.aarch64.sme.mop4s.1x1.nxv4f32(i32 3, <vscale x 4 x float> [[ZN:%.*]], <vscale x 4 x float> [[ZM:%.*]])
+// CHECK-NEXT: tail call void @llvm.aarch64.sme.mop4s.1x1.nxv4f32(i32 1, <vscale x 4 x float> [[ZN:%.*]], <vscale x 4 x float> [[ZM:%.*]])
// CHECK-NEXT: ret void
//
// CPP-CHECK-LABEL: @_Z29test_svmop4s_1x1_za32_f32_f32u13__SVFloat32_tS_(
// CPP-CHECK-NEXT: entry:
-// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.mop4s.1x1.nxv4f32(i32 3, <vscale x 4 x float> [[ZN:%.*]], <vscale x 4 x float> [[ZM:%.*]])
+// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.mop4s.1x1.nxv4f32(i32 1, <vscale x 4 x float> [[ZN:%.*]], <vscale x 4 x float> [[ZM:%.*]])
// CPP-CHECK-NEXT: ret void
//
void test_svmop4s_1x1_za32_f32_f32(svfloat32_t zn, svfloat32_t zm) __arm_streaming __arm_inout("za") {
- SME_ACLE_FUNC(svmop4s_1x1_za32,_f32_f32,)(3, zn, zm);
+ SME_ACLE_FUNC(svmop4s_1x1_za32,_f32_f32,)(1, zn, zm);
}
// CHECK-LABEL: @test_svmop4a_1x1_za64_f64_f64(
// CHECK-NEXT: entry:
-// CHECK-NEXT: tail call void @llvm.aarch64.sme.mop4a.1x1.nxv2f64(i32 3, <vscale x 2 x double> [[ZN:%.*]], <vscale x 2 x double> [[ZM:%.*]])
+// CHECK-NEXT: tail call void @llvm.aarch64.sme.mop4a.1x1.nxv2f64(i32 1, <vscale x 2 x double> [[ZN:%.*]], <vscale x 2 x double> [[ZM:%.*]])
// CHECK-NEXT: ret void
//
// CPP-CHECK-LABEL: @_Z29test_svmop4a_1x1_za64_f64_f64u13__SVFloat64_tS_(
// CPP-CHECK-NEXT: entry:
-// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.mop4a.1x1.nxv2f64(i32 3, <vscale x 2 x double> [[ZN:%.*]], <vscale x 2 x double> [[ZM:%.*]])
+// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.mop4a.1x1.nxv2f64(i32 1, <vscale x 2 x double> [[ZN:%.*]], <vscale x 2 x double> [[ZM:%.*]])
// CPP-CHECK-NEXT: ret void
//
void test_svmop4a_1x1_za64_f64_f64(svfloat64_t zn, svfloat64_t zm) __arm_streaming __arm_inout("za") {
- SME_ACLE_FUNC(svmop4a_1x1_za64,_f64_f64,)(3, zn, zm);
+ SME_ACLE_FUNC(svmop4a_1x1_za64,_f64_f64,)(1, zn, zm);
}
// CHECK-LABEL: @test_svmop4s_1x1_za64_f64_f64(
// CHECK-NEXT: entry:
-// CHECK-NEXT: tail call void @llvm.aarch64.sme.mop4s.1x1.nxv2f64(i32 3, <vscale x 2 x double> [[ZN:%.*]], <vscale x 2 x double> [[ZM:%.*]])
+// CHECK-NEXT: tail call void @llvm.aarch64.sme.mop4s.1x1.nxv2f64(i32 1, <vscale x 2 x double> [[ZN:%.*]], <vscale x 2 x double> [[ZM:%.*]])
// CHECK-NEXT: ret void
//
// CPP-CHECK-LABEL: @_Z29test_svmop4s_1x1_za64_f64_f64u13__SVFloat64_tS_(
// CPP-CHECK-NEXT: entry:
-// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.mop4s.1x1.nxv2f64(i32 3, <vscale x 2 x double> [[ZN:%.*]], <vscale x 2 x double> [[ZM:%.*]])
+// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.mop4s.1x1.nxv2f64(i32 1, <vscale x 2 x double> [[ZN:%.*]], <vscale x 2 x double> [[ZM:%.*]])
// CPP-CHECK-NEXT: ret void
//
void test_svmop4s_1x1_za64_f64_f64(svfloat64_t zn, svfloat64_t zm) __arm_streaming __arm_inout("za") {
- SME_ACLE_FUNC(svmop4s_1x1_za64,_f64_f64,)(3, zn, zm);
+ SME_ACLE_FUNC(svmop4s_1x1_za64,_f64_f64,)(1, zn, zm);
}
// CHECK-LABEL: @test_svmop4a_1x1_za16_bf16_bf16(
// CHECK-NEXT: entry:
-// CHECK-NEXT: tail call void @llvm.aarch64.sme.mop4a.1x1.nxv8bf16(i32 3, <vscale x 8 x bfloat> [[ZN:%.*]], <vscale x 8 x bfloat> [[ZM:%.*]])
+// CHECK-NEXT: tail call void @llvm.aarch64.sme.mop4a.1x1.nxv8bf16(i32 1, <vscale x 8 x bfloat> [[ZN:%.*]], <vscale x 8 x bfloat> [[ZM:%.*]])
// CHECK-NEXT: ret void
//
// CPP-CHECK-LABEL: @_Z31test_svmop4a_1x1_za16_bf16_bf16u14__SVBfloat16_tS_(
// CPP-CHECK-NEXT: entry:
-// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.mop4a.1x1.nxv8bf16(i32 3, <vscale x 8 x bfloat> [[ZN:%.*]], <vscale x 8 x bfloat> [[ZM:%.*]])
+// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.mop4a.1x1.nxv8bf16(i32 1, <vscale x 8 x bfloat> [[ZN:%.*]], <vscale x 8 x bfloat> [[ZM:%.*]])
// CPP-CHECK-NEXT: ret void
//
void test_svmop4a_1x1_za16_bf16_bf16(svbfloat16_t zn, svbfloat16_t zm) __arm_streaming __arm_inout("za") {
- SME_ACLE_FUNC(svmop4a_1x1_za16,_bf16_bf16,)(3, zn, zm);
+ SME_ACLE_FUNC(svmop4a_1x1_za16,_bf16_bf16,)(1, zn, zm);
}
// CHECK-LABEL: @test_svmop4s_1x1_za16_bf16_bf16(
// CHECK-NEXT: entry:
-// CHECK-NEXT: tail call void @llvm.aarch64.sme.mop4s.1x1.nxv8bf16(i32 3, <vscale x 8 x bfloat> [[ZN:%.*]], <vscale x 8 x bfloat> [[ZM:%.*]])
+// CHECK-NEXT: tail call void @llvm.aarch64.sme.mop4s.1x1.nxv8bf16(i32 1, <vscale x 8 x bfloat> [[ZN:%.*]], <vscale x 8 x bfloat> [[ZM:%.*]])
// CHECK-NEXT: ret void
//
// CPP-CHECK-LABEL: @_Z31test_svmop4s_1x1_za16_bf16_bf16u14__SVBfloat16_tS_(
// CPP-CHECK-NEXT: entry:
-// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.mop4s.1x1.nxv8bf16(i32 3, <vscale x 8 x bfloat> [[ZN:%.*]], <vscale x 8 x bfloat> [[ZM:%.*]])
+// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.mop4s.1x1.nxv8bf16(i32 1, <vscale x 8 x bfloat> [[ZN:%.*]], <vscale x 8 x bfloat> [[ZM:%.*]])
// CPP-CHECK-NEXT: ret void
//
void test_svmop4s_1x1_za16_bf16_bf16(svbfloat16_t zn, svbfloat16_t zm) __arm_streaming __arm_inout("za") {
- SME_ACLE_FUNC(svmop4s_1x1_za16,_bf16_bf16,)(3, zn, zm);
+ SME_ACLE_FUNC(svmop4s_1x1_za16,_bf16_bf16,)(1, zn, zm);
}
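
Note on the immediate change above: the tests now pass a tile index of 1 rather than 3 because the first argument is range-checked per accumulator size (ZA.H exposes tiles 0-1, ZA.S tiles 0-3, ZA.D tiles 0-7), and 1 is the largest index accepted by every variant. A minimal usage sketch follows; the helper name is hypothetical, and it assumes the intrinsic spellings introduced by this patch, arm_sme.h, and target features along the lines of the tests' +sme2p1,+sme-mop4 set:

    #include <arm_sme.h>

    /* Hypothetical example, not part of the patch. */
    void mop4_tiles(svint8_t b, svfloat16_t h, svfloat64_t d)
        __arm_streaming __arm_inout("za") {
      svmop4a_1x1_za32_s8_s8(1, b, b);    /* ZA.S: tiles 0..3 accepted   */
      svmop4a_1x1_za16_f16_f16(1, h, h);  /* ZA.H: tiles 0..1, so 1 is   */
                                          /* the largest index valid for */
                                          /* every accumulator size      */
      svmop4a_1x1_za64_f64_f64(1, d, d);  /* ZA.D: tiles 0..7 accepted   */
    }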
diff --git a/llvm/lib/Target/AArch64/SMEInstrFormats.td b/llvm/lib/Target/AArch64/SMEInstrFormats.td
index 9840d36b2c0fc..d9a25bd51ddfc 100644
--- a/llvm/lib/Target/AArch64/SMEInstrFormats.td
+++ b/llvm/lib/Target/AArch64/SMEInstrFormats.td
@@ -650,7 +650,7 @@ multiclass sme_quarter_outer_product_i64<bit zn_u, bit zm_u, bit subtr, string m
def NAME # _MZZ_HtoD # _PSEUDO : sme2_quarter_tile_outer_product_pseudo_single_single<ZPR16Mul2_Lo, ZPR16Mul2_Hi, SMEMatrixTileD>, SMEPseudo2Instr<NAME # _MZZ_HtoD, 0>;
- def : SME2_ZA_Tile_TwoVec_Pat<NAME # _MZZ_HtoD, !cast<SDPatternOperator>(op # "_1x1"), timm32_0_3, nxv8i16>;
+ def : SME2_ZA_Tile_TwoVec_Pat<NAME # _MZZ_HtoD, !cast<SDPatternOperator>(op # "_1x1"), timm32_0_7, nxv8i16>;
def _M2ZZ_HtoD : sme_quarter_outer_product_i64<{zn_u, 1}, {zm_u, 0}, subtr,
ZZ_h_mul_r_Lo, ZPR16Mul2_Hi, mnemonic>;
@@ -5654,7 +5654,7 @@ multiclass sme2_fmop4as_fp16_non_widening<bit S, string mnemonic, string op> {
def NAME # _MZZ_H # _PSEUDO : sme2_quarter_tile_outer_product_pseudo_single_single<ZPR16Mul2_Lo, ZPR16Mul2_Hi, SMEMatrixTileH>, SMEPseudo2Instr<NAME # _MZZ_H, 0>;
- def : SME2_ZA_Tile_TwoVec_Pat<NAME # _MZZ_H, !cast<SDPatternOperator>(op # "_1x1"), timm32_0_3, nxv8f16>;
+ def : SME2_ZA_Tile_TwoVec_Pat<NAME # _MZZ_H, !cast<SDPatternOperator>(op # "_1x1"), timm32_0_1, nxv8f16>;
// Multiple and single vectors
def _M2ZZ_H : sme2_fp16_quarter_tile_outer_product<0, 1, S, mnemonic, ZZ_h_mul_r_Lo, ZPR16Mul2_Hi>;
@@ -5730,7 +5730,7 @@ multiclass sme2_bfmop4as_non_widening<bit S, string mnemonic, string op> {
def NAME # _MZZ_H # _PSEUDO : sme2_quarter_tile_outer_product_pseudo_single_single<ZPR16Mul2_Lo, ZPR16Mul2_Hi, SMEMatrixTileH>, SMEPseudo2Instr<NAME # _MZZ_H, 0>;
- def : SME2_ZA_Tile_TwoVec_Pat<NAME # _MZZ_H, !cast<SDPatternOperator>(op # "_1x1"), timm32_0_3, nxv8bf16>;
+ def : SME2_ZA_Tile_TwoVec_Pat<NAME # _MZZ_H, !cast<SDPatternOperator>(op # "_1x1"), timm32_0_1, nxv8bf16>;
// Multiple and single vectors
def _M2ZZ_H : sme2_bf16_fp16_quarter_tile_outer_product<0, 1, S, mnemonic, ZZ_h_mul_r_Lo, ZPR16Mul2_Hi>;
@@ -5812,7 +5812,7 @@ multiclass sme2_fmop4as_fp64_non_widening<bit S, string mnemonic, string op> {
def NAME # _MZZ_D # _PSEUDO : sme2_quarter_tile_outer_product_pseudo_single_single<ZPR64Mul2_Lo, ZPR64Mul2_Hi, SMEMatrixTileD>, SMEPseudo2Instr<NAME # _MZZ_D, 0>;
- def : SME2_ZA_Tile_TwoVec_Pat<NAME # _MZZ_D, !cast<SDPatternOperator>(op # "_1x1"), timm32_0_3, nxv2f64>;
+ def : SME2_ZA_Tile_TwoVec_Pat<NAME # _MZZ_D, !cast<SDPatternOperator>(op # "_1x1"), timm32_0_7, nxv2f64>;
// Multiple and single vectors
def _M2ZZ_D : sme2_fp64_quarter_tile_outer_product<0, 1, S, mnemonic, ZZ_d_mul_r_Lo, ZPR64Mul2_Hi>;
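
The operand-class changes above make each pattern's immediate range match the architectural tile count (two ZA.H tiles, four ZA.S, eight ZA.D), so that exactly the valid tile indices are matched during selection. A boundary sketch, with the same caveats as the example above (hypothetical helper name, intrinsic spellings taken from this patch's tests):

    #include <arm_sme.h>

    /* Hypothetical example: the largest tile index each variant accepts,
       mirroring timm32_0_1 / timm32_0_3 / timm32_0_7. */
    void mop4_tile_limits(svfloat16_t h, svfloat32_t s, svfloat64_t d)
        __arm_streaming __arm_inout("za") {
      svmop4a_1x1_za16_f16_f16(1, h, h);  /* ZA.H: 2 tiles -> timm32_0_1 */
      svmop4a_1x1_za32_f32_f32(3, s, s);  /* ZA.S: 4 tiles -> timm32_0_3 */
      svmop4a_1x1_za64_f64_f64(7, d, d);  /* ZA.D: 8 tiles -> timm32_0_7 */
    }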
diff --git a/llvm/test/CodeGen/AArch64/sme2-intrinsics-mop4a_1x1.ll b/llvm/test/CodeGen/AArch64/sme2-intrinsics-mop4a_1x1.ll
index 7bcf407d23297..6b878c827efaa 100644
--- a/llvm/test/CodeGen/AArch64/sme2-intrinsics-mop4a_1x1.ll
+++ b/llvm/test/CodeGen/AArch64/sme2-intrinsics-mop4a_1x1.ll
@@ -10,7 +10,7 @@ define void @mop4a_za32_s8(<vscale x 16 x i8> %zn, <vscale x 16 x i8> %zm) #0 {
; CHECK-NEXT: mov z24.d, z1.d
; CHECK-NEXT: smop4a za1.s, z0.b, z24.b
; CHECK-NEXT: ret
- call void @llvm.aarch64.sme.smop4a.wide.1x1.nxv16i8(i32 1, <vscale x 16 x i8> %zn, <vscale x 16 x i8> %zm)
+ call void @llvm.aarch64.sme.smop4a.wide.1x1.nxv16i8(i32 0, <vscale x 16 x i8> %zn, <vscale x 16 x i8> %zm)
ret void
}
@@ -20,7 +20,7 @@ define void @mop4s_za32_s8(<vscale x 16 x i8> %zn, <vscale x 16 x i8> %zm) #0 {
; CHECK-NEXT: mov z24.d, z1.d
; CHECK-NEXT: smop4s za1.s, z0.b, z24.b
; CHECK-NEXT: ret
- call void @llvm.aarch64.sme.smop4s.wide.1x1.nxv16i8(i32 1, <vscale x 16 x i8> %zn, <vscale x 16 x i8> %zm)
+ call void @llvm.aarch64.sme.smop4s.wide.1x1.nxv16i8(i32 0, <vscale x 16 x i8> %zn, <vscale x 16 x i8> %zm)
ret void
}
@@ -30,7 +30,7 @@ define void @mop4a_za32_u8(<vscale x 16 x i8> %zn, <vscale x 16 x i8> %zm) #0 {
; CHECK-NEXT: mov z24.d, z1.d
; CHECK-NEXT: umop4a za1.s, z0.b, z24.b
; CHECK-NEXT: ret
- call void @llvm.aarch64.sme.umop4a.wide.1x1.nxv16i8(i32 1, <vscale x 16 x i8> %zn, <vscale x 16 x i8> %zm)
+ call void @llvm.aarch64.sme.umop4a.wide.1x1.nxv16i8(i32 0, <vscale x 16 x i8> %zn, <vscale x 16 x i8> %zm)
ret void
}
@@ -40,7 +40,7 @@ define void @mop4s_za32_u8(<vscale x 16 x i8> %zn, <vscale x 16 x i8> %zm) #0 {
; CHECK-NEXT: mov z24.d, z1.d
; CHECK-NEXT: umop4s za1.s, z0.b, z24.b
; CHECK-NEXT: ret
- call void @llvm.aarch64.sme.umop4s.wide.1x1.nxv16i8(i32 1, <vscale x 16 x i8> %zn, <vscale x 16 x i8> %zm)
+ call void @llvm.aarch64.sme.umop4s.wide.1x1.nxv16i8(i32 0, <vscale x 16 x i8> %zn, <vscale x 16 x i8> %zm)
ret void
}
@@ -50,7 +50,7 @@ define void @mop4a_za32_s8_u8(<vscale x 16 x i8> %zn, <vscale x 16 x i8> %zm) #0
; CHECK-NEXT: mov z24.d, z1.d
; CHECK-NEXT: sumop4a za1.s, z0.b, z24.b
; CHECK-NEXT: ret
- call void @llvm.aarch64.sme.sumop4a.wide.1x1.nxv16i8(i32 1, <vscale x 16 x i8> %zn, <vscale x 16 x i8> %zm)
+ call void @llvm.aarch64.sme.sumop4a.wide.1x1.nxv16i8(i32 0, <vscale x 16 x i8> %zn, <vscale x 16 x i8> %zm)
ret void
}
@@ -60,7 +60,7 @@ define void @mop4s_za32_s8_u8(<vscale x 16 x i8> %zn, <vscale x 16 x i8> %zm) #0
; CHECK-NEXT: mov z24.d, z1.d
; CHECK-NEXT: sumop4s za1.s, z0.b, z24.b
; CHECK-NEXT: ret
- call void @llvm.aarch64.sme.sumop4s.wide.1x1.nxv16i8(i32 1, <vscale x 16 x i8> %zn, <vscale x 16 x i8> %zm)
+ call void @llvm.aarch64.sme.sumop4s.wide.1x1.nxv16i8(i32 0, <vscale x 16 x i8> %zn, <vscale x 16 x i8> %zm)
ret void
}
@@ -70,7 +70,7 @@ define void @mop4a_za32_u8_s8(<vscale x 16 x i8> %zn, <vscale x 16 x i8> %zm) #0
; CHECK-NEXT: mov z24.d, z1.d
; CHECK-NEXT: usmop4a za1.s, z0.b, z24.b
; CHECK-NEXT: ret
- call void @llvm.aarch64.sme.usmop4a.wide.1x1.nxv16i8(i32 1, <vscale x 16 x i8> %zn, <vscale x 16 x i8> %zm)
+ call void @llvm.aarch64.sme.usmop4a.wide.1x1.nxv16i8(i32 0, <vscale x 16 x i8> %zn, <vscale x 16 x i8> %zm)
ret void
}
@@ -80,7 +80,7 @@ define void @mop4s_za32_u8_s8(<vscale x 16 x i8> %zn, <vscale x 16 x i8> %zm) #0
; CHECK-NEXT: mov z24.d, z1.d
; CHECK-NEXT: usmop4s za1.s, z0.b, z24.b
; CHECK-NEXT: ret
- call void @llvm.aarch64.sme.usmop4s.wide.1x1.nxv16i8(i32 1, <vscale x 16 x i8> %zn, <vscale x 16 x i8> %zm)
+ call void @llvm.aarch64.sme.usmop4s.wide.1x1.nxv16i8(i32 0, <vscale x 16 x i8> %zn, <vscale x 16 x i8> %zm)
ret void
}
@@ -90,7 +90,7 @@ define void @mop4a_za32_s16(<vscale x 8 x i16> %zn, <vscale x 8 x i16> %zm) #0 {
; CHECK-NEXT: mov z24.d, z1.d
; CHECK-NEXT: smop4a za1.s, z0.h, z24.h
; CHECK-NEXT: ret
- call void @llvm.aarch64.sme.smop4a.wide.1x1.nxv8i16(i32 1, <vscale x 8 x i16> %zn, <vscale x 8 x i16> %zm)
+ call void @llvm.aarch64.sme.smop4a.wide.1x1.nxv8i16(i32 0, <vscale x 8 x i16> %zn, <vscale x 8 x i16> %zm)
ret void
}
@@ -100,7 +100,7 @@ define void @mop4s_za32_s16(<vscale x 8 x i16> %zn, <vscale x 8 x i16> %zm) #0 {
; CHECK-NEXT: mov z24.d, z1.d
; CHECK-NEXT: smop4s za1.s, z0.h, z24.h
; CHECK-NEXT: ret
- call void @llvm.aarch64.sme.smop4s.wide.1x1.nxv8i16(i32 1, <vscale x 8 x i16> %zn, <vscale x 8 x i16> %zm)
+ call void @llvm.aarch64.sme.smop4s.wide.1x1.nxv8i16(i32 0, <vscale x 8 x i16> %zn, <vscale x 8 x i16> %zm)
ret void
}
@@ -110,7 +110,7 @@ define void @mop4a_za32_u16(<vscale x 8 x i16> %zn, <vscale x 8 x i16> %zm) #0 {
; CHECK-NEXT: mov z24.d, z1.d
; CHECK-NEXT: umop4a za1.s, z0.h, z24.h
; CHECK-NEXT: ret
- call void @llvm.aarch64.sme.umop4a.wide.1x1.nxv8i16(i32 1, <vscale x 8 x i16> %zn, <vscale x 8 x i16> %zm)
+ call void @llvm.aarch64.sme.umop4a.wide.1x1.nxv8i16(i32 0, <vscale x 8 x i16> %zn, <vscale x 8 x i16> %zm)
ret void
}
@@ -120,7 +120,7 @@ define void @mop4s_za32_u16(<vscale x 8 x i16> %zn, <vscale x 8 x i16> %zm) #0 {
; CHECK-NEXT: mov z24.d, z1.d
; CHECK-NEXT: umop4s za1.s, z0.h, z24.h
; CHECK-NEXT: ret
- call void @llvm.aarch64.sme.umop4s.wide.1x1.nxv8i16(i32 1, <vscale x 8 x i16> %zn, <vscale x 8 x i16> %zm)
+ call void @llvm.aarch64.sme.umop4s.wide.1x1.nxv8i16(i32 0, <vscale x 8 x i16> %zn, <vscale x 8 x i16> %zm)
ret void
}
@@ -130,7 +130,7 @@ define void @mop4a_za32_f16(<vscale x 8 x half> %zn, <vscale x 8 x half> %zm) #0
; CHECK-NEXT: mov z24.d, z1.d
; CHECK-NEXT: fmop4a za1.s, z0.h, z24.h
; CHECK-NEXT: ret
- call void @llvm.aarch64.sme.mop4a.wide.1x1.nxv8f16(i32 1, <vscale x 8 x half> %zn, <vscale x 8 x half> %zm)
+ call void @llvm.aarch64.sme.mop4a.wide.1x1.nxv8f16(i32 0, <vscale x 8 x half> %zn, <vscale x 8 x half> %zm)
ret void
}
@@ -140,7 +140,7 @@ define void @mop4s_za32_f16(<vscale x 8 x half> %zn, <vscale x 8 x half> %zm) #0
; CHECK-NEXT: mov z24.d, z1.d
; CHECK-NEXT: fmop4s za1.s, z0.h, z24.h
; CHECK-NEXT: ret
- call void @llvm.aarch64.sme.mop4s.wide.1x1.nxv8f16(i32 1, <vscale x 8 x half> %zn, <vscale x 8 x half> %zm)
+ call void @llvm.aarch64.sme.mop4s.wide.1x1.nxv8f16(i32 0, <vscale x 8 x half> %zn, <vscale x 8 x half> %zm)
ret void
}
@@ -150,7 +150,7 @@ define void @mop4a_za32_bf16(<vscale x 8 x bfloat> %zn, <vscale x 8 x bfloat> %z
; CHECK-NEXT: mov z24.d, z1.d
; CHECK-NEXT: bfmop4a za1.s, z0.h, z24.h
; CHECK-NEXT: ret
- call void @llvm.aarch64.sme.mop4a.wide.1x1.nxv8bf16(i32 1, <vscale x 8 x bfloat> %zn, <vscale x 8 x bfloat> %zm)
+ call void @llvm.aarch64.sme.mop4a.wide.1x1.nxv8bf16(i32 0, <vscale x 8 x bfloat> %zn, <vscale x 8 x bfloat> %zm)
ret void
}
@@ -160,7 +160,7 @@ define void @mop4s_za32_bf16(<vscale x 8 x bfloat> %zn, <vscale x 8 x bfloat> %z
; CHECK-NEXT: mov z24.d, z1.d
; CHECK-NEXT: bfmop4s za1.s, z0.h, z24.h
; CHECK-NEXT: ret
- call void @llvm.aarch64.sme.mop4s.wide.1x1.nxv8bf16(i32 1, <vscale x 8 x bfloat> %zn, <vscale x 8 x bfloat> %zm)
+ call void @llvm.aarch64.sme.mop4s.wide.1x1.nxv8bf16(i32 0, <vscale x 8 x bfloat> %zn, <vscale x 8 x bfloat> %zm)
ret void
}
@@ -170,7 +170,7 @@ define void @mop4a_za64_s16(<vscale x 8 x i16> %zn, <vscale x 8 x i16> %zm) #0 {
; CHECK-NEXT: mov z24.d, z1.d
; CHECK-NEXT: smop4a za1.s, z0.h, z24.h
; CHECK-NEXT: ret
- call void @llvm.aarch64.sme.smop4a.wide.1x1.nxv8i16(i32 1, <vscale x 8 x i16> %zn, <vscale x 8 x i16> %zm)
+ call void @llvm.aarch64.sme.smop4a.wide.1x1.nxv8i16(i32 0, <vscale x 8 x i16> %zn, <vscale x 8 x i16> %zm)
ret void
}
@@ -180,7 +180,7 @@ define void @mop4s_za64_s16(<vscale x 8 x i16> %zn, <vscale x 8 x i16> %zm) #0 {
; CHECK-NEXT: mov z24.d, z1.d
; CHECK-NEXT: smop4s za1.s, z0.h, z24.h
; CHECK-NEXT: ret
- call void @llvm.aarch64.sme.smop4s.wide.1x1.nxv8i16(i32 1, <vscale x 8 x i16> %zn, <vscale x 8 x i16> %zm)
+ call void @llvm.aarch64.sme.smop4s.wide.1x1.nxv8i16(i32 0, <vscale x 8 x i16> %zn, <vscale x 8 x i16> %zm)
ret void
}
@@ -190,7 +190,7 @@ define void @mop4a_za64_u16(<vscale x 8 x i16> %zn, <vscale x 8 x i16> %zm) #0 {
; CHECK-NEXT: mov z24.d, z1.d
; CHECK-NEXT: umop4a za1.s, z0.h, z24.h
; CHECK-NEXT: ret
- call void @llvm.aarch64.sme.umop4a.wide.1x1.nxv8i16(i32 1, <vscale x 8 x i16> %zn, <vscale x 8 x i16> %zm)
+ call void @llvm.aarch64.sme.umop4a.wide.1x1.nxv8i16(i32 0, <vscale x 8 x i16> %zn, <vscale x 8 x i16> %zm)
ret void
}
@@ -200,7 +200,7 @@ define void @mop4s_za64_u16(<vscale x 8 x i16> %zn, <vscale x 8 x i16> %zm) #0 {
; CHECK-NEXT: mov z24.d, z1.d
; CHECK-NEXT: umop4s za1.s, z0.h, z24.h
; CHECK-NEXT: ret
- call void @llvm.aarch64.sme.umop4s.wide.1x1.nxv8i16(i32 1, <vscale x 8 x i16> %zn, <vscale x 8 x i16> %zm)
+ call void @llvm.aarch64.sme.umop4s.wide.1x1.nxv8i16(i32 0, <vscale x 8 x i16> %zn, <vscale x 8 x i16> %zm)
ret void
}
@@ -210,7 +210,7 @@ define void @mop4a_za64_s16_u16(<vscale x 8 x i16> %zn, <vscale x 8 x i16> %zm)
; CHECK-NEXT: mov z24.d, z1.d
; CHECK-NEXT: sumop4a za1.d, z0.h, z24.h
; CHECK-NEXT: ret
- call void @llvm.aarch64.sme.sumop4a.za64.wide.1x1.nxv8i16(i32 1, <vscale x 8 x i16> %zn, <vscale x 8 x i16> %zm)
+ call void @llvm.aarch64.sme.sumop4a.za64.wide.1x1.nxv8i16(i32 0, <vscale x 8 x i16> %zn, <vscale x 8 x i16> %zm)
ret void
}
@@ -220,7 +220,7 @@ define void @mop4s_za64_s16_u16(<vscale x 8 x i16> %zn, <vscale x 8 x i16> %zm)
; CHECK-NEXT: mov z24.d, z1.d
; CHECK-NEXT: sumop4s za1.d, z0.h, z24.h
; CHECK-NEXT: ret
- call void @llvm.aarch64.sme.sumop4s.za64.wide.1x1.nxv8i16(i32 1, <vscale x 8 x i16> %zn, <vscale x 8 x i16> %zm)
+ call void @llvm.aarch64.sme.sumop4s.za64.wide.1x1.nxv8i16(i32 0, <vscale x 8 x i16> %zn, <vscale x 8 x i16> %zm)
ret void
}
@@ -230,7 +230,7 @@ define void @mop4a_za64_u16_s16(<vscale x 8 x i16> %zn, <vscale x 8 x i16> %zm)
; CHECK-NEXT: mov z24.d, z1.d
; CHECK-NEXT: usmop4a za1.d, z0.h, z24.h
; CHECK-NEXT: ret
- call void @llvm.aarch64.sme.usmop4a.za64.wide.1x1.nxv8i16(i32 1, <vscale x 8 x i16> %zn, <vscale x 8 x i16> %zm)
+ call void @llvm.aarch64.sme.usmop4a.za64.wide.1x1.nxv8i16(i32 0, <vscale x 8 x i16> %zn, <vscale x 8 x i16> %zm)
ret void
}
@@ -240,7 +240,7 @@ define void @mop4s_za64_u16_s16(<vscale x 8 x i16> %zn, <vscale x 8 x i16> %zm)
; CHECK-NEXT: mov z24.d, z1.d
; CHECK-NEXT: usmop4s za1.d, z0.h, z24.h
; CHECK-NEXT: ret
- call void @llvm.aarch64.sme.usmop4s.za64.wide.1x1.nxv8i16(i32 1, <vscale x 8 x i16> %zn, <vscale x 8 x i16> %zm)
+ call void @llvm.aarch64.sme.usmop4s.za64.wide.1x1.nxv8i16(i32 0, <vscale x 8 x i16> %zn, <vscale x 8 x i16> %zm)
ret void
}
@@ -251,7 +251,7 @@ define void @mop4a_za16_f16(<vscale x 8 x half> %zn, <vscale x 8 x half> %zm) #0
; CHECK-NEXT: mov z24.d, z1.d
; CHECK-NEXT: fmop4a za1.h, z0.h, z24.h
; CHECK-NEXT: ret
- call void @llvm.aarch64.sme.mop4a.1x1.nxv8f16(i32 1, <vscale x 8 x half> %zn, <vscale x 8 x half> %zm)
+ call void @llvm.aarch64.sme.mop4a.1x1.nxv8f16(i32 0, <vscale x 8 x half> %zn, <vscale x 8 x half> %zm)
ret void
}
@@ -261,7 +261,7 @@ define void @mop4s_za16_f16(<vscale x 8 x half> %zn, <vscale x 8 x half> %zm) #0
; CHECK-NEXT: mov z24.d, z1.d
; CHECK-NEXT: fmop4s za1.h, z0.h, z24.h
; CHECK-NEXT: ret
- call void @llvm.aarch64.sme.mop4s.1x1.nxv8f16(i32 1, <vscale x 8 x half> %zn, <vscale x 8 x half> %zm)
+ call void @llvm.aarch64.sme.mop4s.1x1.nxv8f16(i32 0, <vscale x 8 x half> %zn, <vscale x 8 x half> %zm)
ret void
}
@@ -271,7 +271,7 @@ define void @mop4a_za32_f32(<vscale x 4 x float> %zn, <vscale x 4 x float> %zm)
; CHECK-NEXT: mov z24.d, z1.d
; CHECK-NEXT: fmop4a za1.s, z0.s, z24.s
; CHECK-NEXT: ret
- call void @llvm.aarch64.sme.mop4a.1x1.nxv4f32(i32 1, <vscale x 4 x float> %zn, <vscale x 4 x float> %zm)
+ call void @llvm.aarch64.sme.mop4a.1x1.nxv4f32(i32 0, <vscale x 4 x float> %zn, <vscale x 4 x float> %zm)
ret void
}
@@ -281,7 +281,7 @@ define void @mop4s_za32_f32(<vscale x 4 x float> %zn, <vscale x 4 x float> %zm)
; CHECK-NEXT: mov z24.d, z1.d
; CHECK-NEXT: fmop4s za1.s, z0.s, z24.s
; CHECK-NEXT: ret
- call void @llvm.aarch64.sme.mop4s.1x1.nxv4f32(i32 1, <vscale x 4 x float> %zn, <vscale x 4 x float> %zm)
+ call void @llvm.aarch64.sme.mop4s.1x1.nxv4f32(i32 0, <vscale x 4 x float> %zn, <vscale x 4 x float> %zm)
ret void
}
@@ -291,7 +291,7 @@ define void @mop4a_za64_f64(<vscale x 2 x double> %zn, <vscale x 2 x double> %zm
; CHECK-NEXT: mov z24.d, z1.d
; CHECK-NEXT: fmop4a za1.d, z0.d, z24.d
; CHECK-NEXT: ret
- call void @llvm.aarch64.sme.mop4a.1x1.nxv2f64(i32 1, <vscale x 2 x double> %zn, <vscale x 2 x double> %zm)
+ call void @llvm.aarch64.sme.mop4a.1x1.nxv2f64(i32 0, <vscale x 2 x double> %zn, <vscale x 2 x double> %zm)
ret void
}
@@ -301,7 +301,7 @@ define void @mop4s_za64_f64(<vscale x 2 x double> %zn, <vscale x 2 x double> %zm
; CHECK-NEXT: mov z24.d, z1.d
; CHECK-NEXT: fmop4s za1.d, z0.d, z24.d
; CHECK-NEXT: ret
- call void @llvm.aarch64.sme.mop4s.1x1.nxv2f64(i32 1, <vscale x 2 x double> %zn, <vscale x 2 x double> %zm)
+ call void @llvm.aarch64.sme.mop4s.1x1.nxv2f64(i32 0, <vscale x 2 x double> %zn, <vscale x 2 x double> %zm)
ret void
}
@@ -311,7 +311,7 @@ define void @mop4a_za16_bf16(<vscale x 8 x bfloat> %zn, <vscale x 8 x bfloat> %z
; CHECK-NEXT: mov z24.d, z1.d
; CHECK-NEXT: bfmop4a za1.h, z0.h, z24.h
; CHECK-NEXT: ret
- call void @llvm.aarch64.sme.mop4a.1x1.nxv8bf16(i32 1, <vscale x 8 x bfloat> %zn, <vscale x 8 x bfloat> %zm)
+ call void @llvm.aarch64.sme.mop4a.1x1.nxv8bf16(i32 0, <vscale x 8 x bfloat> %zn, <vscale x 8 x bfloat> %zm)
ret void
}
@@ -320,8 +320,100 @@ define void @mop4s_za16_bf16(<vscale x 8 x bfloat> %zn, <vscale x 8 x bfloat> %z
; CHECK: // %bb.0:
; CHECK-NEXT: mov z24.d, z1.d
; CHECK-NEXT: bfmop4s za1.h, z0.h, z24.h
+; CHECK-NEXT: ret
+ call void @llvm.aarch64.sme.mop4s.1x1.nxv8bf16(i32 0, <vscale x 8 x bfloat> %zn, <vscale x 8 x bfloat> %zm)
+ ret void
+}
+
+; Tile limits
+define void @mop4s_za32_s8_limit(<vscale x 16 x i8> %zn, <vscale x 16 x i8> %zm) #0 {
+; CHECK-LABEL: mop4s_za32_s8_limit:
+; CHECK: // %bb.0:
+; CHECK-NEXT: mov z24.d, z1.d
+; CHECK-NEXT: smop4s za3.s, z0.b, z24.b
+; CHECK-NEXT: ret
+ call void @llvm.aarch64.sme.smop4s.wide.1x1.nxv16i8(i32 3, <vscale x 16 x i8> %zn, <vscale x 16 x i8> %zm)
+ ret void
+}
+
+define void @mop4s_za32_s16_limit(<vscale x 8 x i16> %zn, <vscale x 8 x i16> %zm) #0 {
+; CHECK-LABEL: mop4s_za32_s16_limit:
+; CHECK: // %bb.0:
+; CHECK-NEXT: mov z24.d, z1.d
+; CHECK-NEXT: smop4s za3.s, z0.h, z24.h
+; CHECK-NEXT: ret
+ call void @llvm.aarch64.sme.smop4s.wide.1x1.nxv8i16(i32 3, <vscale x 8 x i16> %zn, <vscale x 8 x i16> %zm)
+ ret void
+}
+
+define void @mop4s_za32_f16_limit(<vscale x 8 x half> %zn, <vscale x 8 x half> %zm) #0 {
+; CHECK-LABEL: mop4s_za32_f16_limit:
+; CHECK: // %bb.0:
+; CHECK-NEXT: mov z24.d, z1.d
+; CHECK-NEXT: fmop4s za3.s, z0.h, z24.h
+; CHECK-NEXT: ret
+ call void @llvm.aarch64.sme.mop4s.wide.1x1.nxv8f16(i32 3, <vscale x 8 x half> %zn, <vscale x 8 x half> %zm)
+ ret void
+}
+
+define void @mop4s_za32_bf16_limit(<vscale x 8 x bfloat> %zn, <vscale x 8 x bfloat> %zm) #0 {
+; CHECK-LABEL: mop4s_za32_bf16_limit:
+; CHECK: // %bb.0:
+; CHECK-NEXT: mov z24.d, z1.d
+; CHECK-NEXT: bfmop4s za3.s, z0.h, z24.h
+; CHECK-NEXT: ret
+ call void @llvm.aarch64.sme.mop4s.wide.1x1.nxv8bf16(i32 3, <vscale x 8 x bfloat> %zn, <vscale x 8 x bfloat> %zm)
+ ret void
+}
+
+define void @mop4s_za64_s16_limit(<vscale x 8 x i16> %zn, <vscale x 8 x i16> %zm) #0 {
+; CHECK-LABEL: mop4s_za64_s16_limit:
+; CHECK: // %bb.0:
+; CHECK-NEXT: mov z24.d, z1.d
+; CHECK-NEXT: smop4s za7.d, z0.h, z24.h
+; CHECK-NEXT: ret
+ call void @llvm.aarch64.sme.smop4s.za64.wide.1x1.nxv8i16(i32 7, <vscale x 8 x i16> %zn, <vscale x 8 x i16> %zm)
+ ret void
+}
+
+define void @mop4s_za64_f64_limit(<vscale x 2 x double> %zn, <vscale x 2 x double> %zm) #0 {
+; CHECK-LABEL: mop4s_za64_f64_limit:
+; CHECK: // %bb.0:
+; CHECK-NEXT: mov z24.d, z1.d
+; CHECK-NEXT: fmop4s za7.d, z0.d, z24.d
+; CHECK-NEXT: ret
+ call void @llvm.aarch64.sme.mop4s.1x1.nxv2f64(i32 7, <vscale x 2 x double> %zn, <vscale x 2 x double> %zm)
+ ret void
+}
+
+define void @mop4s_za32_f32_limit(<vscale x 4 x float> %zn, <vscale x 4 x float> %zm) #0 {
+; CHECK-LABEL: mop4s_za32_f32_limit:
+; CHECK: // %bb.0:
+; CHECK-NEXT: mov z24.d, z1.d
+; CHECK-NEXT: fmop4s za3.s, z0.s, z24.s
+; CHECK-NEXT: ret
+ call void @llvm.aarch64.sme.mop4s.1x1.nxv4f32(i32 3, <vscale x 4 x float> %zn, <vscale x 4 x float> %zm)
+ ret void
+}
+
+define void @mop4s_za16_f16_limit(<vscale x 8 x half> %zn, <vscale x 8 x half> %zm) #0 {
+; CHECK-LABEL: mop4s_za16_f16_limit:
+; CHECK: // %bb.0:
+; CHECK-NEXT: mov z24.d, z1.d
+; CHECK-NEXT: fmop4s za1.h, z0.h, z24.h
+; CHECK-NEXT: ret
+ call void @llvm.aarch64.sme.mop4s.1x1.nxv8f16(i32 1, <vscale x 8 x half> %zn, <vscale x 8 x half> %zm)
+ ret void
+}
+
+define void @mop4s_za16_bf16_limit(<vscale x 8 x bfloat> %zn, <vscale x 8 x bfloat> %zm) #0 {
+; CHECK-LABEL: mop4s_za16_bf16_limit:
+; CHECK: // %bb.0:
+; CHECK-NEXT: mov z24.d, z1.d
+; CHECK-NEXT: bfmop4s za1.h, z0.h, z24.h
; CHECK-NEXT: ret
call void @llvm.aarch64.sme.mop4s.1x1.nxv8bf16(i32 1, <vscale x 8 x bfloat> %zn, <vscale x 8 x bfloat> %zm)
ret void
}
+
attributes #0 = {nounwind "target-features" = "+sme-i16i64,+sme-f64f64,+sme-b16b16,+sme2p1,+bf16,+sme-f16f16,+sme-mop4" }
>From eb39b7113c928ae739307d4c684dbf887842645b Mon Sep 17 00:00:00 2001
From: Virginia Cangelosi <virginia.cangelosi at arm.com>
Date: Wed, 26 Feb 2025 16:13:26 +0000
Subject: [PATCH 07/11] Fix llvm test
---
.../AArch64/sme2-intrinsics-mop4a_1x1.ll | 96 +++++++++----------
1 file changed, 48 insertions(+), 48 deletions(-)
diff --git a/llvm/test/CodeGen/AArch64/sme2-intrinsics-mop4a_1x1.ll b/llvm/test/CodeGen/AArch64/sme2-intrinsics-mop4a_1x1.ll
index 6b878c827efaa..ec899fab7cf21 100644
--- a/llvm/test/CodeGen/AArch64/sme2-intrinsics-mop4a_1x1.ll
+++ b/llvm/test/CodeGen/AArch64/sme2-intrinsics-mop4a_1x1.ll
@@ -8,7 +8,7 @@ define void @mop4a_za32_s8(<vscale x 16 x i8> %zn, <vscale x 16 x i8> %zm) #0 {
; CHECK-LABEL: mop4a_za32_s8:
; CHECK: // %bb.0:
; CHECK-NEXT: mov z24.d, z1.d
-; CHECK-NEXT: smop4a za1.s, z0.b, z24.b
+; CHECK-NEXT: smop4a za0.s, z0.b, z24.b
; CHECK-NEXT: ret
call void @llvm.aarch64.sme.smop4a.wide.1x1.nxv16i8(i32 0, <vscale x 16 x i8> %zn, <vscale x 16 x i8> %zm)
ret void
@@ -18,7 +18,7 @@ define void @mop4s_za32_s8(<vscale x 16 x i8> %zn, <vscale x 16 x i8> %zm) #0 {
; CHECK-LABEL: mop4s_za32_s8:
; CHECK: // %bb.0:
; CHECK-NEXT: mov z24.d, z1.d
-; CHECK-NEXT: smop4s za1.s, z0.b, z24.b
+; CHECK-NEXT: smop4s za0.s, z0.b, z24.b
; CHECK-NEXT: ret
call void @llvm.aarch64.sme.smop4s.wide.1x1.nxv16i8(i32 0, <vscale x 16 x i8> %zn, <vscale x 16 x i8> %zm)
ret void
@@ -28,7 +28,7 @@ define void @mop4a_za32_u8(<vscale x 16 x i8> %zn, <vscale x 16 x i8> %zm) #0 {
; CHECK-LABEL: mop4a_za32_u8:
; CHECK: // %bb.0:
; CHECK-NEXT: mov z24.d, z1.d
-; CHECK-NEXT: umop4a za1.s, z0.b, z24.b
+; CHECK-NEXT: umop4a za0.s, z0.b, z24.b
; CHECK-NEXT: ret
call void @llvm.aarch64.sme.umop4a.wide.1x1.nxv16i8(i32 0, <vscale x 16 x i8> %zn, <vscale x 16 x i8> %zm)
ret void
@@ -38,7 +38,7 @@ define void @mop4s_za32_u8(<vscale x 16 x i8> %zn, <vscale x 16 x i8> %zm) #0 {
; CHECK-LABEL: mop4s_za32_u8:
; CHECK: // %bb.0:
; CHECK-NEXT: mov z24.d, z1.d
-; CHECK-NEXT: umop4s za1.s, z0.b, z24.b
+; CHECK-NEXT: umop4s za0.s, z0.b, z24.b
; CHECK-NEXT: ret
call void @llvm.aarch64.sme.umop4s.wide.1x1.nxv16i8(i32 0, <vscale x 16 x i8> %zn, <vscale x 16 x i8> %zm)
ret void
@@ -48,7 +48,7 @@ define void @mop4a_za32_s8_u8(<vscale x 16 x i8> %zn, <vscale x 16 x i8> %zm) #0
; CHECK-LABEL: mop4a_za32_s8_u8:
; CHECK: // %bb.0:
; CHECK-NEXT: mov z24.d, z1.d
-; CHECK-NEXT: sumop4a za1.s, z0.b, z24.b
+; CHECK-NEXT: sumop4a za0.s, z0.b, z24.b
; CHECK-NEXT: ret
call void @llvm.aarch64.sme.sumop4a.wide.1x1.nxv16i8(i32 0, <vscale x 16 x i8> %zn, <vscale x 16 x i8> %zm)
ret void
@@ -58,7 +58,7 @@ define void @mop4s_za32_s8_u8(<vscale x 16 x i8> %zn, <vscale x 16 x i8> %zm) #0
; CHECK-LABEL: mop4s_za32_s8_u8:
; CHECK: // %bb.0:
; CHECK-NEXT: mov z24.d, z1.d
-; CHECK-NEXT: sumop4s za1.s, z0.b, z24.b
+; CHECK-NEXT: sumop4s za0.s, z0.b, z24.b
; CHECK-NEXT: ret
call void @llvm.aarch64.sme.sumop4s.wide.1x1.nxv16i8(i32 0, <vscale x 16 x i8> %zn, <vscale x 16 x i8> %zm)
ret void
@@ -68,7 +68,7 @@ define void @mop4a_za32_u8_s8(<vscale x 16 x i8> %zn, <vscale x 16 x i8> %zm) #0
; CHECK-LABEL: mop4a_za32_u8_s8:
; CHECK: // %bb.0:
; CHECK-NEXT: mov z24.d, z1.d
-; CHECK-NEXT: usmop4a za1.s, z0.b, z24.b
+; CHECK-NEXT: usmop4a za0.s, z0.b, z24.b
; CHECK-NEXT: ret
call void @llvm.aarch64.sme.usmop4a.wide.1x1.nxv16i8(i32 0, <vscale x 16 x i8> %zn, <vscale x 16 x i8> %zm)
ret void
@@ -78,7 +78,7 @@ define void @mop4s_za32_u8_s8(<vscale x 16 x i8> %zn, <vscale x 16 x i8> %zm) #0
; CHECK-LABEL: mop4s_za32_u8_s8:
; CHECK: // %bb.0:
; CHECK-NEXT: mov z24.d, z1.d
-; CHECK-NEXT: usmop4s za1.s, z0.b, z24.b
+; CHECK-NEXT: usmop4s za0.s, z0.b, z24.b
; CHECK-NEXT: ret
call void @llvm.aarch64.sme.usmop4s.wide.1x1.nxv16i8(i32 0, <vscale x 16 x i8> %zn, <vscale x 16 x i8> %zm)
ret void
@@ -88,7 +88,7 @@ define void @mop4a_za32_s16(<vscale x 8 x i16> %zn, <vscale x 8 x i16> %zm) #0 {
; CHECK-LABEL: mop4a_za32_s16:
; CHECK: // %bb.0:
; CHECK-NEXT: mov z24.d, z1.d
-; CHECK-NEXT: smop4a za1.s, z0.h, z24.h
+; CHECK-NEXT: smop4a za0.s, z0.h, z24.h
; CHECK-NEXT: ret
call void @llvm.aarch64.sme.smop4a.wide.1x1.nxv8i16(i32 0, <vscale x 8 x i16> %zn, <vscale x 8 x i16> %zm)
ret void
@@ -98,7 +98,7 @@ define void @mop4s_za32_s16(<vscale x 8 x i16> %zn, <vscale x 8 x i16> %zm) #0 {
; CHECK-LABEL: mop4s_za32_s16:
; CHECK: // %bb.0:
; CHECK-NEXT: mov z24.d, z1.d
-; CHECK-NEXT: smop4s za1.s, z0.h, z24.h
+; CHECK-NEXT: smop4s za0.s, z0.h, z24.h
; CHECK-NEXT: ret
call void @llvm.aarch64.sme.smop4s.wide.1x1.nxv8i16(i32 0, <vscale x 8 x i16> %zn, <vscale x 8 x i16> %zm)
ret void
@@ -108,7 +108,7 @@ define void @mop4a_za32_u16(<vscale x 8 x i16> %zn, <vscale x 8 x i16> %zm) #0 {
; CHECK-LABEL: mop4a_za32_u16:
; CHECK: // %bb.0:
; CHECK-NEXT: mov z24.d, z1.d
-; CHECK-NEXT: umop4a za1.s, z0.h, z24.h
+; CHECK-NEXT: umop4a za0.s, z0.h, z24.h
; CHECK-NEXT: ret
call void @llvm.aarch64.sme.umop4a.wide.1x1.nxv8i16(i32 0, <vscale x 8 x i16> %zn, <vscale x 8 x i16> %zm)
ret void
@@ -118,7 +118,7 @@ define void @mop4s_za32_u16(<vscale x 8 x i16> %zn, <vscale x 8 x i16> %zm) #0 {
; CHECK-LABEL: mop4s_za32_u16:
; CHECK: // %bb.0:
; CHECK-NEXT: mov z24.d, z1.d
-; CHECK-NEXT: umop4s za1.s, z0.h, z24.h
+; CHECK-NEXT: umop4s za0.s, z0.h, z24.h
; CHECK-NEXT: ret
call void @llvm.aarch64.sme.umop4s.wide.1x1.nxv8i16(i32 0, <vscale x 8 x i16> %zn, <vscale x 8 x i16> %zm)
ret void
@@ -128,7 +128,7 @@ define void @mop4a_za32_f16(<vscale x 8 x half> %zn, <vscale x 8 x half> %zm) #0
; CHECK-LABEL: mop4a_za32_f16:
; CHECK: // %bb.0:
; CHECK-NEXT: mov z24.d, z1.d
-; CHECK-NEXT: fmop4a za1.s, z0.h, z24.h
+; CHECK-NEXT: fmop4a za0.s, z0.h, z24.h
; CHECK-NEXT: ret
call void @llvm.aarch64.sme.mop4a.wide.1x1.nxv8f16(i32 0, <vscale x 8 x half> %zn, <vscale x 8 x half> %zm)
ret void
@@ -138,7 +138,7 @@ define void @mop4s_za32_f16(<vscale x 8 x half> %zn, <vscale x 8 x half> %zm) #0
; CHECK-LABEL: mop4s_za32_f16:
; CHECK: // %bb.0:
; CHECK-NEXT: mov z24.d, z1.d
-; CHECK-NEXT: fmop4s za1.s, z0.h, z24.h
+; CHECK-NEXT: fmop4s za0.s, z0.h, z24.h
; CHECK-NEXT: ret
call void @llvm.aarch64.sme.mop4s.wide.1x1.nxv8f16(i32 0, <vscale x 8 x half> %zn, <vscale x 8 x half> %zm)
ret void
@@ -148,7 +148,7 @@ define void @mop4a_za32_bf16(<vscale x 8 x bfloat> %zn, <vscale x 8 x bfloat> %z
; CHECK-LABEL: mop4a_za32_bf16:
; CHECK: // %bb.0:
; CHECK-NEXT: mov z24.d, z1.d
-; CHECK-NEXT: bfmop4a za1.s, z0.h, z24.h
+; CHECK-NEXT: bfmop4a za0.s, z0.h, z24.h
; CHECK-NEXT: ret
call void @llvm.aarch64.sme.mop4a.wide.1x1.nxv8bf16(i32 0, <vscale x 8 x bfloat> %zn, <vscale x 8 x bfloat> %zm)
ret void
@@ -158,7 +158,7 @@ define void @mop4s_za32_bf16(<vscale x 8 x bfloat> %zn, <vscale x 8 x bfloat> %z
; CHECK-LABEL: mop4s_za32_bf16:
; CHECK: // %bb.0:
; CHECK-NEXT: mov z24.d, z1.d
-; CHECK-NEXT: bfmop4s za1.s, z0.h, z24.h
+; CHECK-NEXT: bfmop4s za0.s, z0.h, z24.h
; CHECK-NEXT: ret
call void @llvm.aarch64.sme.mop4s.wide.1x1.nxv8bf16(i32 0, <vscale x 8 x bfloat> %zn, <vscale x 8 x bfloat> %zm)
ret void
@@ -168,9 +168,9 @@ define void @mop4a_za64_s16(<vscale x 8 x i16> %zn, <vscale x 8 x i16> %zm) #0 {
; CHECK-LABEL: mop4a_za64_s16:
; CHECK: // %bb.0:
; CHECK-NEXT: mov z24.d, z1.d
-; CHECK-NEXT: smop4a za1.s, z0.h, z24.h
+; CHECK-NEXT: smop4a za0.d, z0.h, z24.h
; CHECK-NEXT: ret
- call void @llvm.aarch64.sme.smop4a.wide.1x1.nxv8i16(i32 0, <vscale x 8 x i16> %zn, <vscale x 8 x i16> %zm)
+ call void @llvm.aarch64.sme.smop4a.za64.wide.1x1.nxv8i16(i32 0, <vscale x 8 x i16> %zn, <vscale x 8 x i16> %zm)
ret void
}
@@ -178,9 +178,9 @@ define void @mop4s_za64_s16(<vscale x 8 x i16> %zn, <vscale x 8 x i16> %zm) #0 {
; CHECK-LABEL: mop4s_za64_s16:
; CHECK: // %bb.0:
; CHECK-NEXT: mov z24.d, z1.d
-; CHECK-NEXT: smop4s za1.s, z0.h, z24.h
+; CHECK-NEXT: smop4s za0.d, z0.h, z24.h
; CHECK-NEXT: ret
- call void @llvm.aarch64.sme.smop4s.wide.1x1.nxv8i16(i32 0, <vscale x 8 x i16> %zn, <vscale x 8 x i16> %zm)
+ call void @llvm.aarch64.sme.smop4s.za64.wide.1x1.nxv8i16(i32 0, <vscale x 8 x i16> %zn, <vscale x 8 x i16> %zm)
ret void
}
@@ -188,9 +188,9 @@ define void @mop4a_za64_u16(<vscale x 8 x i16> %zn, <vscale x 8 x i16> %zm) #0 {
; CHECK-LABEL: mop4a_za64_u16:
; CHECK: // %bb.0:
; CHECK-NEXT: mov z24.d, z1.d
-; CHECK-NEXT: umop4a za1.s, z0.h, z24.h
+; CHECK-NEXT: umop4a za0.d, z0.h, z24.h
; CHECK-NEXT: ret
- call void @llvm.aarch64.sme.umop4a.wide.1x1.nxv8i16(i32 0, <vscale x 8 x i16> %zn, <vscale x 8 x i16> %zm)
+ call void @llvm.aarch64.sme.umop4a.za64.wide.1x1.nxv8i16(i32 0, <vscale x 8 x i16> %zn, <vscale x 8 x i16> %zm)
ret void
}
@@ -198,9 +198,9 @@ define void @mop4s_za64_u16(<vscale x 8 x i16> %zn, <vscale x 8 x i16> %zm) #0 {
; CHECK-LABEL: mop4s_za64_u16:
; CHECK: // %bb.0:
; CHECK-NEXT: mov z24.d, z1.d
-; CHECK-NEXT: umop4s za1.s, z0.h, z24.h
+; CHECK-NEXT: umop4s za0.d, z0.h, z24.h
; CHECK-NEXT: ret
- call void @llvm.aarch64.sme.umop4s.wide.1x1.nxv8i16(i32 0, <vscale x 8 x i16> %zn, <vscale x 8 x i16> %zm)
+ call void @llvm.aarch64.sme.umop4s.za64.wide.1x1.nxv8i16(i32 0, <vscale x 8 x i16> %zn, <vscale x 8 x i16> %zm)
ret void
}
@@ -208,7 +208,7 @@ define void @mop4a_za64_s16_u16(<vscale x 8 x i16> %zn, <vscale x 8 x i16> %zm)
; CHECK-LABEL: mop4a_za64_s16_u16:
; CHECK: // %bb.0:
; CHECK-NEXT: mov z24.d, z1.d
-; CHECK-NEXT: sumop4a za1.d, z0.h, z24.h
+; CHECK-NEXT: sumop4a za0.d, z0.h, z24.h
; CHECK-NEXT: ret
call void @llvm.aarch64.sme.sumop4a.za64.wide.1x1.nxv8i16(i32 0, <vscale x 8 x i16> %zn, <vscale x 8 x i16> %zm)
ret void
@@ -218,7 +218,7 @@ define void @mop4s_za64_s16_u16(<vscale x 8 x i16> %zn, <vscale x 8 x i16> %zm)
; CHECK-LABEL: mop4s_za64_s16_u16:
; CHECK: // %bb.0:
; CHECK-NEXT: mov z24.d, z1.d
-; CHECK-NEXT: sumop4s za1.d, z0.h, z24.h
+; CHECK-NEXT: sumop4s za0.d, z0.h, z24.h
; CHECK-NEXT: ret
call void @llvm.aarch64.sme.sumop4s.za64.wide.1x1.nxv8i16(i32 0, <vscale x 8 x i16> %zn, <vscale x 8 x i16> %zm)
ret void
@@ -228,7 +228,7 @@ define void @mop4a_za64_u16_s16(<vscale x 8 x i16> %zn, <vscale x 8 x i16> %zm)
; CHECK-LABEL: mop4a_za64_u16_s16:
; CHECK: // %bb.0:
; CHECK-NEXT: mov z24.d, z1.d
-; CHECK-NEXT: usmop4a za1.d, z0.h, z24.h
+; CHECK-NEXT: usmop4a za0.d, z0.h, z24.h
; CHECK-NEXT: ret
call void @llvm.aarch64.sme.usmop4a.za64.wide.1x1.nxv8i16(i32 0, <vscale x 8 x i16> %zn, <vscale x 8 x i16> %zm)
ret void
@@ -238,7 +238,7 @@ define void @mop4s_za64_u16_s16(<vscale x 8 x i16> %zn, <vscale x 8 x i16> %zm)
; CHECK-LABEL: mop4s_za64_u16_s16:
; CHECK: // %bb.0:
; CHECK-NEXT: mov z24.d, z1.d
-; CHECK-NEXT: usmop4s za1.d, z0.h, z24.h
+; CHECK-NEXT: usmop4s za0.d, z0.h, z24.h
; CHECK-NEXT: ret
call void @llvm.aarch64.sme.usmop4s.za64.wide.1x1.nxv8i16(i32 0, <vscale x 8 x i16> %zn, <vscale x 8 x i16> %zm)
ret void
@@ -249,7 +249,7 @@ define void @mop4a_za16_f16(<vscale x 8 x half> %zn, <vscale x 8 x half> %zm) #0
; CHECK-LABEL: mop4a_za16_f16:
; CHECK: // %bb.0:
; CHECK-NEXT: mov z24.d, z1.d
-; CHECK-NEXT: fmop4a za1.h, z0.h, z24.h
+; CHECK-NEXT: fmop4a za0.h, z0.h, z24.h
; CHECK-NEXT: ret
call void @llvm.aarch64.sme.mop4a.1x1.nxv8f16(i32 0, <vscale x 8 x half> %zn, <vscale x 8 x half> %zm)
ret void
@@ -259,7 +259,7 @@ define void @mop4s_za16_f16(<vscale x 8 x half> %zn, <vscale x 8 x half> %zm) #0
; CHECK-LABEL: mop4s_za16_f16:
; CHECK: // %bb.0:
; CHECK-NEXT: mov z24.d, z1.d
-; CHECK-NEXT: fmop4s za1.h, z0.h, z24.h
+; CHECK-NEXT: fmop4s za0.h, z0.h, z24.h
; CHECK-NEXT: ret
call void @llvm.aarch64.sme.mop4s.1x1.nxv8f16(i32 0, <vscale x 8 x half> %zn, <vscale x 8 x half> %zm)
ret void
@@ -269,7 +269,7 @@ define void @mop4a_za32_f32(<vscale x 4 x float> %zn, <vscale x 4 x float> %zm)
; CHECK-LABEL: mop4a_za32_f32:
; CHECK: // %bb.0:
; CHECK-NEXT: mov z24.d, z1.d
-; CHECK-NEXT: fmop4a za1.s, z0.s, z24.s
+; CHECK-NEXT: fmop4a za0.s, z0.s, z24.s
; CHECK-NEXT: ret
call void @llvm.aarch64.sme.mop4a.1x1.nxv4f32(i32 0, <vscale x 4 x float> %zn, <vscale x 4 x float> %zm)
ret void
@@ -279,7 +279,7 @@ define void @mop4s_za32_f32(<vscale x 4 x float> %zn, <vscale x 4 x float> %zm)
; CHECK-LABEL: mop4s_za32_f32:
; CHECK: // %bb.0:
; CHECK-NEXT: mov z24.d, z1.d
-; CHECK-NEXT: fmop4s za1.s, z0.s, z24.s
+; CHECK-NEXT: fmop4s za0.s, z0.s, z24.s
; CHECK-NEXT: ret
call void @llvm.aarch64.sme.mop4s.1x1.nxv4f32(i32 0, <vscale x 4 x float> %zn, <vscale x 4 x float> %zm)
ret void
@@ -289,7 +289,7 @@ define void @mop4a_za64_f64(<vscale x 2 x double> %zn, <vscale x 2 x double> %zm
; CHECK-LABEL: mop4a_za64_f64:
; CHECK: // %bb.0:
; CHECK-NEXT: mov z24.d, z1.d
-; CHECK-NEXT: fmop4a za1.d, z0.d, z24.d
+; CHECK-NEXT: fmop4a za0.d, z0.d, z24.d
; CHECK-NEXT: ret
call void @llvm.aarch64.sme.mop4a.1x1.nxv2f64(i32 0, <vscale x 2 x double> %zn, <vscale x 2 x double> %zm)
ret void
@@ -299,7 +299,7 @@ define void @mop4s_za64_f64(<vscale x 2 x double> %zn, <vscale x 2 x double> %zm
; CHECK-LABEL: mop4s_za64_f64:
; CHECK: // %bb.0:
; CHECK-NEXT: mov z24.d, z1.d
-; CHECK-NEXT: fmop4s za1.d, z0.d, z24.d
+; CHECK-NEXT: fmop4s za0.d, z0.d, z24.d
; CHECK-NEXT: ret
call void @llvm.aarch64.sme.mop4s.1x1.nxv2f64(i32 0, <vscale x 2 x double> %zn, <vscale x 2 x double> %zm)
ret void
@@ -309,7 +309,7 @@ define void @mop4a_za16_bf16(<vscale x 8 x bfloat> %zn, <vscale x 8 x bfloat> %z
; CHECK-LABEL: mop4a_za16_bf16:
; CHECK: // %bb.0:
; CHECK-NEXT: mov z24.d, z1.d
-; CHECK-NEXT: bfmop4a za1.h, z0.h, z24.h
+; CHECK-NEXT: bfmop4a za0.h, z0.h, z24.h
; CHECK-NEXT: ret
call void @llvm.aarch64.sme.mop4a.1x1.nxv8bf16(i32 0, <vscale x 8 x bfloat> %zn, <vscale x 8 x bfloat> %zm)
ret void
@@ -319,7 +319,7 @@ define void @mop4s_za16_bf16(<vscale x 8 x bfloat> %zn, <vscale x 8 x bfloat> %z
; CHECK-LABEL: mop4s_za16_bf16:
; CHECK: // %bb.0:
; CHECK-NEXT: mov z24.d, z1.d
-; CHECK-NEXT: bfmop4s za1.h, z0.h, z24.h
+; CHECK-NEXT: bfmop4s za0.h, z0.h, z24.h
; CHECK-NEXT: ret
call void @llvm.aarch64.sme.mop4s.1x1.nxv8bf16(i32 0, <vscale x 8 x bfloat> %zn, <vscale x 8 x bfloat> %zm)
ret void
@@ -327,52 +327,52 @@ define void @mop4s_za16_bf16(<vscale x 8 x bfloat> %zn, <vscale x 8 x bfloat> %z
; Tile limits
define void @mop4s_za32_s8_limit(<vscale x 16 x i8> %zn, <vscale x 16 x i8> %zm) #0 {
-; CHECK-LABEL: mop4s_za32_s8:
+; CHECK-LABEL: mop4s_za32_s8_limit:
; CHECK: // %bb.0:
; CHECK-NEXT: mov z24.d, z1.d
-; CHECK-NEXT: smop4s za1.s, z0.b, z24.b
+; CHECK-NEXT: smop4s za3.s, z0.b, z24.b
; CHECK-NEXT: ret
call void @llvm.aarch64.sme.smop4s.wide.1x1.nxv16i8(i32 3, <vscale x 16 x i8> %zn, <vscale x 16 x i8> %zm)
ret void
}
define void @mop4s_za32_s16_limit(<vscale x 8 x i16> %zn, <vscale x 8 x i16> %zm) #0 {
-; CHECK-LABEL: mop4s_za32_s16:
+; CHECK-LABEL: mop4s_za32_s16_limit:
; CHECK: // %bb.0:
; CHECK-NEXT: mov z24.d, z1.d
-; CHECK-NEXT: smop4s za1.s, z0.h, z24.h
+; CHECK-NEXT: smop4s za3.s, z0.h, z24.h
; CHECK-NEXT: ret
call void @llvm.aarch64.sme.smop4s.wide.1x1.nxv8i16(i32 3, <vscale x 8 x i16> %zn, <vscale x 8 x i16> %zm)
ret void
}
define void @mop4s_za32_f16_limit(<vscale x 8 x half> %zn, <vscale x 8 x half> %zm) #0 {
-; CHECK-LABEL: mop4s_za32_f16:
+; CHECK-LABEL: mop4s_za32_f16_limit:
; CHECK: // %bb.0:
; CHECK-NEXT: mov z24.d, z1.d
-; CHECK-NEXT: fmop4s za1.s, z0.h, z24.h
+; CHECK-NEXT: fmop4s za3.s, z0.h, z24.h
; CHECK-NEXT: ret
call void @llvm.aarch64.sme.mop4s.wide.1x1.nxv8f16(i32 3, <vscale x 8 x half> %zn, <vscale x 8 x half> %zm)
ret void
}
define void @mop4s_za32_bf16_limit(<vscale x 8 x bfloat> %zn, <vscale x 8 x bfloat> %zm) #0 {
-; CHECK-LABEL: mop4s_za32_bf16:
+; CHECK-LABEL: mop4s_za32_bf16_limit:
; CHECK: // %bb.0:
; CHECK-NEXT: mov z24.d, z1.d
-; CHECK-NEXT: bfmop4s za1.s, z0.h, z24.h
+; CHECK-NEXT: bfmop4s za3.s, z0.h, z24.h
; CHECK-NEXT: ret
call void @llvm.aarch64.sme.mop4s.wide.1x1.nxv8bf16(i32 3, <vscale x 8 x bfloat> %zn, <vscale x 8 x bfloat> %zm)
ret void
}
define void @mop4s_za64_s16_limit(<vscale x 8 x i16> %zn, <vscale x 8 x i16> %zm) #0 {
-; CHECK-LABEL: mop4s_za64_s16:
+; CHECK-LABEL: mop4s_za64_s16_limit:
; CHECK: // %bb.0:
; CHECK-NEXT: mov z24.d, z1.d
-; CHECK-NEXT: smop4s za1.s, z0.h, z24.h
+; CHECK-NEXT: smop4s za7.d, z0.h, z24.h
; CHECK-NEXT: ret
- call void @llvm.aarch64.sme.smop4s.wide.1x1.nxv8i16(i32 7, <vscale x 8 x i16> %zn, <vscale x 8 x i16> %zm)
+ call void @llvm.aarch64.sme.smop4s.za64.wide.1x1.nxv8i16(i32 7, <vscale x 8 x i16> %zn, <vscale x 8 x i16> %zm)
ret void
}
@@ -407,7 +407,7 @@ define void @mop4s_za16_f16_limit(<vscale x 8 x half> %zn, <vscale x 8 x half> %
}
define void @mop4s_za16_bf16_limit(<vscale x 8 x bfloat> %zn, <vscale x 8 x bfloat> %zm) #0 {
-; CHECK-LABEL: mop4s_za16_bf16:
+; CHECK-LABEL: mop4s_za16_bf16_limit:
; CHECK: // %bb.0:
; CHECK-NEXT: mov z24.d, z1.d
; CHECK-NEXT: bfmop4s za1.h, z0.h, z24.h
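Taken together, the _limit tests above pin down the largest legal ZA tile
index for each accumulator width: za1.h for 16-bit tiles, za3.s for 32-bit,
and za7.d for 64-bit. A minimal C sketch of those bounds at the ACLE level
(the 1x1 spellings here are assumed from this series' naming scheme, not
copied from the tests):

#include <arm_sme.h>

void tile_limits(svint16_t zn, svint16_t zm) __arm_streaming __arm_inout("za") {
  svmop4s_1x1_za32_s16_s16(3, zn, zm); // .s tiles: index 0..3
  svmop4s_1x1_za64_s16_s16(7, zn, zm); // .d tiles: index 0..7 (sme-i16i64)
  // .h tiles (za16) analogously accept 0..1.
}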
>From d76cbdc056df9795df9d14584968f8ddf18b3e20 Mon Sep 17 00:00:00 2001
From: Virginia Cangelosi <virginia.cangelosi at arm.com>
Date: Wed, 26 Feb 2025 10:33:32 +0000
Subject: [PATCH 08/11] [Clang][LLVM] Implement single-multi vectors MOP4{A/S}
---
clang/include/clang/Basic/arm_sve.td | 4 +-
.../sme2-intrinsics/acle_sme2_mop4_1x2.c | 466 ++++++++++++++++++
clang/utils/TableGen/SveEmitter.cpp | 4 +-
llvm/include/llvm/IR/IntrinsicsAArch64.td | 35 ++
llvm/lib/Target/AArch64/SMEInstrFormats.td | 70 ++-
.../AArch64/sme2-intrinsics-mop4a_1x2.ll | 361 ++++++++++++++
6 files changed, 928 insertions(+), 12 deletions(-)
create mode 100644 clang/test/CodeGen/AArch64/sme2-intrinsics/acle_sme2_mop4_1x2.c
create mode 100644 llvm/test/CodeGen/AArch64/sme2-intrinsics-mop4a_1x2.ll
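For orientation: the new 1x2 (single-multi) forms pair one Zn vector with a
two-vector Zm tuple. A minimal usage sketch, assuming the ACLE spellings
exercised by the tests below (svcreate2_s8 is the standard ACLE tuple
constructor):

#include <arm_sme.h>

void sketch(svint8_t zn, svint8_t zm0, svint8_t zm1)
    __arm_streaming __arm_inout("za") {
  // Pack the two Zm halves into a multi-vector tuple.
  svint8x2_t zm = svcreate2_s8(zm0, zm1);
  // Outer-product accumulate into quarter-tile 0 of ZA (32-bit, widening).
  svmop4a_1x2_za32_s8_s8(0, zn, zm);
}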
diff --git a/clang/include/clang/Basic/arm_sve.td b/clang/include/clang/Basic/arm_sve.td
index b20383e72e66a..c5cf478ea6e9b 100644
--- a/clang/include/clang/Basic/arm_sve.td
+++ b/clang/include/clang/Basic/arm_sve.td
@@ -2420,8 +2420,8 @@ let SVETargetGuard = InvalidMode, SMETargetGuard = "sme2" in {
let SVETargetGuard = InvalidMode, SMETargetGuard = "sme2" in {
def SVSUNPK_X2 : SInst<"svunpk_{d}[_{1}_x2]", "2h", "sil", MergeNone, "aarch64_sve_sunpk_x2", [IsStreaming], []>;
def SVUUNPK_X2 : SInst<"svunpk_{d}[_{1}_x2]", "2h", "UsUiUl", MergeNone, "aarch64_sve_uunpk_x2", [IsStreaming], []>;
- def SVSUNPK_X4 : SInst<"svunpk_{d}[_{3}_x4]", "42.h", "sil", MergeNone, "aarch64_sve_sunpk_x4", [IsStreaming], []>;
- def SVUUNPK_X4 : SInst<"svunpk_{d}[_{3}_x4]", "42.h", "UsUiUl", MergeNone, "aarch64_sve_uunpk_x4", [IsStreaming], []>;
+ def SVSUNPK_X4 : SInst<"svunpk_{d}[_{1}_x4]", "42.h", "sil", MergeNone, "aarch64_sve_sunpk_x4", [IsStreaming], []>;
+ def SVUUNPK_X4 : SInst<"svunpk_{d}[_{1}_x4]", "42.h", "UsUiUl", MergeNone, "aarch64_sve_uunpk_x4", [IsStreaming], []>;
}
let SVETargetGuard = InvalidMode, SMETargetGuard = "sme2,fp8" in {
diff --git a/clang/test/CodeGen/AArch64/sme2-intrinsics/acle_sme2_mop4_1x2.c b/clang/test/CodeGen/AArch64/sme2-intrinsics/acle_sme2_mop4_1x2.c
new file mode 100644
index 0000000000000..624fcd9281d83
--- /dev/null
+++ b/clang/test/CodeGen/AArch64/sme2-intrinsics/acle_sme2_mop4_1x2.c
@@ -0,0 +1,466 @@
+// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py
+
+// REQUIRES: aarch64-registered-target
+// RUN: %clang_cc1 -triple aarch64 -target-feature +bf16 -target-feature +sme-mop4 -target-feature +sme-f16f16 -target-feature +sme-i16i64 -target-feature +sme-b16b16 -target-feature +sme-f64f64 -target-feature +sme -target-feature +sme2 -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s
+// RUN: %clang_cc1 -triple aarch64 -target-feature +bf16 -target-feature +sme-mop4 -target-feature +sme-f16f16 -target-feature +sme-i16i64 -target-feature +sme-b16b16 -target-feature +sme-f64f64 -target-feature +sme -target-feature +sme2 -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK
+// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64 -target-feature +bf16 -target-feature +sme-mop4 -target-feature +sme-f16f16 -target-feature +sme-i16i64 -target-feature +sme-b16b16 -target-feature +sme-f64f64 -target-feature +sme -target-feature +sme2 -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s
+// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64 -target-feature +bf16 -target-feature +sme-mop4 -target-feature +sme-f16f16 -target-feature +sme-i16i64 -target-feature +sme-b16b16 -target-feature +sme-f64f64 -target-feature +sme -target-feature +sme2 -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK
+// RUN: %clang_cc1 -triple aarch64 -target-feature +bf16 -target-feature +sme-mop4 -target-feature +sme-f16f16 -target-feature +sme-i16i64 -target-feature +sme-b16b16 -target-feature +sme-f64f64 -target-feature +sme -target-feature +sme2 -S -disable-O0-optnone -Werror -Wall -o /dev/null %s
+
+
+#include <arm_sme.h>
+
+#ifdef SVE_OVERLOADED_FORMS
+#define SME_ACLE_FUNC(A1,A2_UNUSED,A3) A1##A3
+#else
+#define SME_ACLE_FUNC(A1,A2,A3) A1##A2##A3
+#endif
+
+// CHECK-LABEL: @test_svmop4a_1x2_za32_s8_s8(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: tail call void @llvm.aarch64.sme.smop4a.wide.1x2.nxv16i8(i32 3, <vscale x 16 x i8> [[ZN:%.*]], <vscale x 16 x i8> [[ZM_COERCE0:%.*]], <vscale x 16 x i8> [[ZM_COERCE1:%.*]])
+// CHECK-NEXT: ret void
+//
+// CPP-CHECK-LABEL: @_Z27test_svmop4a_1x2_za32_s8_s8u10__SVInt8_t10svint8x2_t(
+// CPP-CHECK-NEXT: entry:
+// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.smop4a.wide.1x2.nxv16i8(i32 3, <vscale x 16 x i8> [[ZN:%.*]], <vscale x 16 x i8> [[ZM_COERCE0:%.*]], <vscale x 16 x i8> [[ZM_COERCE1:%.*]])
+// CPP-CHECK-NEXT: ret void
+//
+void test_svmop4a_1x2_za32_s8_s8(svint8_t zn, svint8x2_t zm) __arm_streaming __arm_inout("za") {
+ SME_ACLE_FUNC(svmop4a_1x2_za32,_s8_s8,)(3, zn, zm);
+}
+
+// CHECK-LABEL: @test_svmop4s_1x2_za32_s8_s8(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: tail call void @llvm.aarch64.sme.smop4s.wide.1x2.nxv16i8(i32 3, <vscale x 16 x i8> [[ZN:%.*]], <vscale x 16 x i8> [[ZM_COERCE0:%.*]], <vscale x 16 x i8> [[ZM_COERCE1:%.*]])
+// CHECK-NEXT: ret void
+//
+// CPP-CHECK-LABEL: @_Z27test_svmop4s_1x2_za32_s8_s8u10__SVInt8_t10svint8x2_t(
+// CPP-CHECK-NEXT: entry:
+// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.smop4s.wide.1x2.nxv16i8(i32 3, <vscale x 16 x i8> [[ZN:%.*]], <vscale x 16 x i8> [[ZM_COERCE0:%.*]], <vscale x 16 x i8> [[ZM_COERCE1:%.*]])
+// CPP-CHECK-NEXT: ret void
+//
+void test_svmop4s_1x2_za32_s8_s8(svint8_t zn, svint8x2_t zm) __arm_streaming __arm_inout("za") {
+ SME_ACLE_FUNC(svmop4s_1x2_za32,_s8_s8,)(3, zn, zm);
+}
+
+// CHECK-LABEL: @test_svmop4a_1x2_za32_u8_u8(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: tail call void @llvm.aarch64.sme.umop4a.wide.1x2.nxv16i8(i32 3, <vscale x 16 x i8> [[ZN:%.*]], <vscale x 16 x i8> [[ZM_COERCE0:%.*]], <vscale x 16 x i8> [[ZM_COERCE1:%.*]])
+// CHECK-NEXT: ret void
+//
+// CPP-CHECK-LABEL: @_Z27test_svmop4a_1x2_za32_u8_u8u11__SVUint8_t11svuint8x2_t(
+// CPP-CHECK-NEXT: entry:
+// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.umop4a.wide.1x2.nxv16i8(i32 3, <vscale x 16 x i8> [[ZN:%.*]], <vscale x 16 x i8> [[ZM_COERCE0:%.*]], <vscale x 16 x i8> [[ZM_COERCE1:%.*]])
+// CPP-CHECK-NEXT: ret void
+//
+void test_svmop4a_1x2_za32_u8_u8(svuint8_t zn, svuint8x2_t zm) __arm_streaming __arm_inout("za") {
+ SME_ACLE_FUNC(svmop4a_1x2_za32,_u8_u8,)(3, zn, zm);
+}
+
+// CHECK-LABEL: @test_svmop4s_1x2_za32_u8_u8(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: tail call void @llvm.aarch64.sme.umop4s.wide.1x2.nxv16i8(i32 3, <vscale x 16 x i8> [[ZN:%.*]], <vscale x 16 x i8> [[ZM_COERCE0:%.*]], <vscale x 16 x i8> [[ZM_COERCE1:%.*]])
+// CHECK-NEXT: ret void
+//
+// CPP-CHECK-LABEL: @_Z27test_svmop4s_1x2_za32_u8_u8u11__SVUint8_t11svuint8x2_t(
+// CPP-CHECK-NEXT: entry:
+// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.umop4s.wide.1x2.nxv16i8(i32 3, <vscale x 16 x i8> [[ZN:%.*]], <vscale x 16 x i8> [[ZM_COERCE0:%.*]], <vscale x 16 x i8> [[ZM_COERCE1:%.*]])
+// CPP-CHECK-NEXT: ret void
+//
+void test_svmop4s_1x2_za32_u8_u8(svuint8_t zn, svuint8x2_t zm) __arm_streaming __arm_inout("za") {
+ SME_ACLE_FUNC(svmop4s_1x2_za32,_u8_u8,)(3, zn, zm);
+}
+
+// CHECK-LABEL: @test_svmop4a_1x2_za32_s8_u8(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: tail call void @llvm.aarch64.sme.sumop4a.wide.1x2.nxv16i8(i32 3, <vscale x 16 x i8> [[ZN:%.*]], <vscale x 16 x i8> [[ZM_COERCE0:%.*]], <vscale x 16 x i8> [[ZM_COERCE1:%.*]])
+// CHECK-NEXT: ret void
+//
+// CPP-CHECK-LABEL: @_Z27test_svmop4a_1x2_za32_s8_u8u10__SVInt8_t11svuint8x2_t(
+// CPP-CHECK-NEXT: entry:
+// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.sumop4a.wide.1x2.nxv16i8(i32 3, <vscale x 16 x i8> [[ZN:%.*]], <vscale x 16 x i8> [[ZM_COERCE0:%.*]], <vscale x 16 x i8> [[ZM_COERCE1:%.*]])
+// CPP-CHECK-NEXT: ret void
+//
+void test_svmop4a_1x2_za32_s8_u8(svint8_t zn, svuint8x2_t zm) __arm_streaming __arm_inout("za") {
+ SME_ACLE_FUNC(svmop4a_1x2_za32,_s8_u8,)(3, zn, zm);
+}
+
+// CHECK-LABEL: @test_svmop4s_1x2_za32_s8_u8(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: tail call void @llvm.aarch64.sme.sumop4s.wide.1x2.nxv16i8(i32 3, <vscale x 16 x i8> [[ZN:%.*]], <vscale x 16 x i8> [[ZM_COERCE0:%.*]], <vscale x 16 x i8> [[ZM_COERCE1:%.*]])
+// CHECK-NEXT: ret void
+//
+// CPP-CHECK-LABEL: @_Z27test_svmop4s_1x2_za32_s8_u8u10__SVInt8_t11svuint8x2_t(
+// CPP-CHECK-NEXT: entry:
+// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.sumop4s.wide.1x2.nxv16i8(i32 3, <vscale x 16 x i8> [[ZN:%.*]], <vscale x 16 x i8> [[ZM_COERCE0:%.*]], <vscale x 16 x i8> [[ZM_COERCE1:%.*]])
+// CPP-CHECK-NEXT: ret void
+//
+void test_svmop4s_1x2_za32_s8_u8(svint8_t zn, svuint8x2_t zm) __arm_streaming __arm_inout("za") {
+ SME_ACLE_FUNC(svmop4s_1x2_za32,_s8_u8,)(3, zn, zm);
+}
+
+// CHECK-LABEL: @test_svmop4a_1x2_za32_u8_s8(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: tail call void @llvm.aarch64.sme.usmop4a.wide.1x2.nxv16i8(i32 3, <vscale x 16 x i8> [[ZN:%.*]], <vscale x 16 x i8> [[ZM_COERCE0:%.*]], <vscale x 16 x i8> [[ZM_COERCE1:%.*]])
+// CHECK-NEXT: ret void
+//
+// CPP-CHECK-LABEL: @_Z27test_svmop4a_1x2_za32_u8_s8u11__SVUint8_t10svint8x2_t(
+// CPP-CHECK-NEXT: entry:
+// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.usmop4a.wide.1x2.nxv16i8(i32 3, <vscale x 16 x i8> [[ZN:%.*]], <vscale x 16 x i8> [[ZM_COERCE0:%.*]], <vscale x 16 x i8> [[ZM_COERCE1:%.*]])
+// CPP-CHECK-NEXT: ret void
+//
+void test_svmop4a_1x2_za32_u8_s8(svuint8_t zn, svint8x2_t zm) __arm_streaming __arm_inout("za") {
+ SME_ACLE_FUNC(svmop4a_1x2_za32,_u8_s8,)(3, zn, zm);
+}
+
+// CHECK-LABEL: @test_svmop4s_1x2_za32_u8_s8(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: tail call void @llvm.aarch64.sme.usmop4s.wide.1x2.nxv16i8(i32 3, <vscale x 16 x i8> [[ZN:%.*]], <vscale x 16 x i8> [[ZM_COERCE0:%.*]], <vscale x 16 x i8> [[ZM_COERCE1:%.*]])
+// CHECK-NEXT: ret void
+//
+// CPP-CHECK-LABEL: @_Z27test_svmop4s_1x2_za32_u8_s8u11__SVUint8_t10svint8x2_t(
+// CPP-CHECK-NEXT: entry:
+// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.usmop4s.wide.1x2.nxv16i8(i32 3, <vscale x 16 x i8> [[ZN:%.*]], <vscale x 16 x i8> [[ZM_COERCE0:%.*]], <vscale x 16 x i8> [[ZM_COERCE1:%.*]])
+// CPP-CHECK-NEXT: ret void
+//
+void test_svmop4s_1x2_za32_u8_s8(svuint8_t zn, svint8x2_t zm) __arm_streaming __arm_inout("za") {
+ SME_ACLE_FUNC(svmop4s_1x2_za32,_u8_s8,)(3, zn, zm);
+}
+
+// CHECK-LABEL: @test_svmop4a_1x2_za32_s16_s16(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: tail call void @llvm.aarch64.sme.smop4a.wide.1x2.nxv8i16(i32 3, <vscale x 8 x i16> [[ZN:%.*]], <vscale x 8 x i16> [[ZM_COERCE0:%.*]], <vscale x 8 x i16> [[ZM_COERCE1:%.*]])
+// CHECK-NEXT: ret void
+//
+// CPP-CHECK-LABEL: @_Z29test_svmop4a_1x2_za32_s16_s16u11__SVInt16_t11svint16x2_t(
+// CPP-CHECK-NEXT: entry:
+// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.smop4a.wide.1x2.nxv8i16(i32 3, <vscale x 8 x i16> [[ZN:%.*]], <vscale x 8 x i16> [[ZM_COERCE0:%.*]], <vscale x 8 x i16> [[ZM_COERCE1:%.*]])
+// CPP-CHECK-NEXT: ret void
+//
+void test_svmop4a_1x2_za32_s16_s16(svint16_t zn, svint16x2_t zm) __arm_streaming __arm_inout("za") {
+ SME_ACLE_FUNC(svmop4a_1x2_za32,_s16_s16,)(3, zn, zm);
+}
+
+// CHECK-LABEL: @test_svmop4s_1x2_za32_s16_s16(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: tail call void @llvm.aarch64.sme.smop4s.wide.1x2.nxv8i16(i32 3, <vscale x 8 x i16> [[ZN:%.*]], <vscale x 8 x i16> [[ZM_COERCE0:%.*]], <vscale x 8 x i16> [[ZM_COERCE1:%.*]])
+// CHECK-NEXT: ret void
+//
+// CPP-CHECK-LABEL: @_Z29test_svmop4s_1x2_za32_s16_s16u11__SVInt16_t11svint16x2_t(
+// CPP-CHECK-NEXT: entry:
+// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.smop4s.wide.1x2.nxv8i16(i32 3, <vscale x 8 x i16> [[ZN:%.*]], <vscale x 8 x i16> [[ZM_COERCE0:%.*]], <vscale x 8 x i16> [[ZM_COERCE1:%.*]])
+// CPP-CHECK-NEXT: ret void
+//
+void test_svmop4s_1x2_za32_s16_s16(svint16_t zn, svint16x2_t zm) __arm_streaming __arm_inout("za") {
+ SME_ACLE_FUNC(svmop4s_1x2_za32,_s16_s16,)(3, zn, zm);
+}
+
+// CHECK-LABEL: @test_svmop4a_1x2_za32_u16_u16(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: tail call void @llvm.aarch64.sme.umop4a.wide.1x2.nxv8i16(i32 3, <vscale x 8 x i16> [[ZN:%.*]], <vscale x 8 x i16> [[ZM_COERCE0:%.*]], <vscale x 8 x i16> [[ZM_COERCE1:%.*]])
+// CHECK-NEXT: ret void
+//
+// CPP-CHECK-LABEL: @_Z29test_svmop4a_1x2_za32_u16_u16u12__SVUint16_t12svuint16x2_t(
+// CPP-CHECK-NEXT: entry:
+// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.umop4a.wide.1x2.nxv8i16(i32 3, <vscale x 8 x i16> [[ZN:%.*]], <vscale x 8 x i16> [[ZM_COERCE0:%.*]], <vscale x 8 x i16> [[ZM_COERCE1:%.*]])
+// CPP-CHECK-NEXT: ret void
+//
+void test_svmop4a_1x2_za32_u16_u16(svuint16_t zn, svuint16x2_t zm) __arm_streaming __arm_inout("za") {
+ SME_ACLE_FUNC(svmop4a_1x2_za32,_u16_u16,)(3, zn, zm);
+}
+
+// CHECK-LABEL: @test_svmop4s_1x2_za32_u16_u16(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: tail call void @llvm.aarch64.sme.umop4s.wide.1x2.nxv8i16(i32 3, <vscale x 8 x i16> [[ZN:%.*]], <vscale x 8 x i16> [[ZM_COERCE0:%.*]], <vscale x 8 x i16> [[ZM_COERCE1:%.*]])
+// CHECK-NEXT: ret void
+//
+// CPP-CHECK-LABEL: @_Z29test_svmop4s_1x2_za32_u16_u16u12__SVUint16_t12svuint16x2_t(
+// CPP-CHECK-NEXT: entry:
+// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.umop4s.wide.1x2.nxv8i16(i32 3, <vscale x 8 x i16> [[ZN:%.*]], <vscale x 8 x i16> [[ZM_COERCE0:%.*]], <vscale x 8 x i16> [[ZM_COERCE1:%.*]])
+// CPP-CHECK-NEXT: ret void
+//
+void test_svmop4s_1x2_za32_u16_u16(svuint16_t zn, svuint16x2_t zm) __arm_streaming __arm_inout("za") {
+ SME_ACLE_FUNC(svmop4s_1x2_za32,_u16_u16,)(3, zn, zm);
+}
+
+// CHECK-LABEL: @test_svmop4a_1x2_za32_f16_f16(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: tail call void @llvm.aarch64.sme.mop4a.wide.1x2.nxv8f16(i32 3, <vscale x 8 x half> [[ZN:%.*]], <vscale x 8 x half> [[ZM_COERCE0:%.*]], <vscale x 8 x half> [[ZM_COERCE1:%.*]])
+// CHECK-NEXT: ret void
+//
+// CPP-CHECK-LABEL: @_Z29test_svmop4a_1x2_za32_f16_f16u13__SVFloat16_t13svfloat16x2_t(
+// CPP-CHECK-NEXT: entry:
+// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.mop4a.wide.1x2.nxv8f16(i32 3, <vscale x 8 x half> [[ZN:%.*]], <vscale x 8 x half> [[ZM_COERCE0:%.*]], <vscale x 8 x half> [[ZM_COERCE1:%.*]])
+// CPP-CHECK-NEXT: ret void
+//
+void test_svmop4a_1x2_za32_f16_f16(svfloat16_t zn, svfloat16x2_t zm) __arm_streaming __arm_inout("za") {
+ SME_ACLE_FUNC(svmop4a_1x2_za32,_f16_f16,)(3, zn, zm);
+}
+
+// CHECK-LABEL: @test_svmop4s_1x2_za32_f16_f16(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: tail call void @llvm.aarch64.sme.mop4s.wide.1x2.nxv8f16(i32 3, <vscale x 8 x half> [[ZN:%.*]], <vscale x 8 x half> [[ZM_COERCE0:%.*]], <vscale x 8 x half> [[ZM_COERCE1:%.*]])
+// CHECK-NEXT: ret void
+//
+// CPP-CHECK-LABEL: @_Z29test_svmop4s_1x2_za32_f16_f16u13__SVFloat16_t13svfloat16x2_t(
+// CPP-CHECK-NEXT: entry:
+// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.mop4s.wide.1x2.nxv8f16(i32 3, <vscale x 8 x half> [[ZN:%.*]], <vscale x 8 x half> [[ZM_COERCE0:%.*]], <vscale x 8 x half> [[ZM_COERCE1:%.*]])
+// CPP-CHECK-NEXT: ret void
+//
+void test_svmop4s_1x2_za32_f16_f16(svfloat16_t zn, svfloat16x2_t zm) __arm_streaming __arm_inout("za") {
+ SME_ACLE_FUNC(svmop4s_1x2_za32,_f16_f16,)(3, zn, zm);
+}
+
+// CHECK-LABEL: @test_svmop4a_1x2_za32_bf16_bf16(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: tail call void @llvm.aarch64.sme.mop4a.wide.1x2.nxv8bf16(i32 3, <vscale x 8 x bfloat> [[ZN:%.*]], <vscale x 8 x bfloat> [[ZM_COERCE0:%.*]], <vscale x 8 x bfloat> [[ZM_COERCE1:%.*]])
+// CHECK-NEXT: ret void
+//
+// CPP-CHECK-LABEL: @_Z31test_svmop4a_1x2_za32_bf16_bf16u14__SVBfloat16_t14svbfloat16x2_t(
+// CPP-CHECK-NEXT: entry:
+// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.mop4a.wide.1x2.nxv8bf16(i32 3, <vscale x 8 x bfloat> [[ZN:%.*]], <vscale x 8 x bfloat> [[ZM_COERCE0:%.*]], <vscale x 8 x bfloat> [[ZM_COERCE1:%.*]])
+// CPP-CHECK-NEXT: ret void
+//
+void test_svmop4a_1x2_za32_bf16_bf16(svbfloat16_t zn, svbfloat16x2_t zm) __arm_streaming __arm_inout("za") {
+ SME_ACLE_FUNC(svmop4a_1x2_za32,_bf16_bf16,)(3, zn, zm);
+}
+
+// CHECK-LABEL: @test_svmop4s_1x2_za32_bf16_bf16(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: tail call void @llvm.aarch64.sme.mop4s.wide.1x2.nxv8bf16(i32 3, <vscale x 8 x bfloat> [[ZN:%.*]], <vscale x 8 x bfloat> [[ZM_COERCE0:%.*]], <vscale x 8 x bfloat> [[ZM_COERCE1:%.*]])
+// CHECK-NEXT: ret void
+//
+// CPP-CHECK-LABEL: @_Z31test_svmop4s_1x2_za32_bf16_bf16u14__SVBfloat16_t14svbfloat16x2_t(
+// CPP-CHECK-NEXT: entry:
+// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.mop4s.wide.1x2.nxv8bf16(i32 3, <vscale x 8 x bfloat> [[ZN:%.*]], <vscale x 8 x bfloat> [[ZM_COERCE0:%.*]], <vscale x 8 x bfloat> [[ZM_COERCE1:%.*]])
+// CPP-CHECK-NEXT: ret void
+//
+void test_svmop4s_1x2_za32_bf16_bf16(svbfloat16_t zn, svbfloat16x2_t zm) __arm_streaming __arm_inout("za") {
+ SME_ACLE_FUNC(svmop4s_1x2_za32,_bf16_bf16,)(3, zn, zm);
+}
+
+// CHECK-LABEL: @test_svmop4a_1x2_za64_s16_s16(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: tail call void @llvm.aarch64.sme.smop4a.za64.wide.1x2.nxv8i16(i32 3, <vscale x 8 x i16> [[ZN:%.*]], <vscale x 8 x i16> [[ZM_COERCE0:%.*]], <vscale x 8 x i16> [[ZM_COERCE1:%.*]])
+// CHECK-NEXT: ret void
+//
+// CPP-CHECK-LABEL: @_Z29test_svmop4a_1x2_za64_s16_s16u11__SVInt16_t11svint16x2_t(
+// CPP-CHECK-NEXT: entry:
+// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.smop4a.za64.wide.1x2.nxv8i16(i32 3, <vscale x 8 x i16> [[ZN:%.*]], <vscale x 8 x i16> [[ZM_COERCE0:%.*]], <vscale x 8 x i16> [[ZM_COERCE1:%.*]])
+// CPP-CHECK-NEXT: ret void
+//
+void test_svmop4a_1x2_za64_s16_s16(svint16_t zn, svint16x2_t zm) __arm_streaming __arm_inout("za") {
+ SME_ACLE_FUNC(svmop4a_1x2_za64,_s16_s16,)(3, zn, zm);
+}
+
+// CHECK-LABEL: @test_svmop4s_1x2_za64_s16_s16(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: tail call void @llvm.aarch64.sme.smop4s.za64.wide.1x2.nxv8i16(i32 3, <vscale x 8 x i16> [[ZN:%.*]], <vscale x 8 x i16> [[ZM_COERCE0:%.*]], <vscale x 8 x i16> [[ZM_COERCE1:%.*]])
+// CHECK-NEXT: ret void
+//
+// CPP-CHECK-LABEL: @_Z29test_svmop4s_1x2_za64_s16_s16u11__SVInt16_t11svint16x2_t(
+// CPP-CHECK-NEXT: entry:
+// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.smop4s.za64.wide.1x2.nxv8i16(i32 3, <vscale x 8 x i16> [[ZN:%.*]], <vscale x 8 x i16> [[ZM_COERCE0:%.*]], <vscale x 8 x i16> [[ZM_COERCE1:%.*]])
+// CPP-CHECK-NEXT: ret void
+//
+void test_svmop4s_1x2_za64_s16_s16(svint16_t zn, svint16x2_t zm) __arm_streaming __arm_inout("za") {
+ SME_ACLE_FUNC(svmop4s_1x2_za64,_s16_s16,)(3, zn, zm);
+}
+
+// CHECK-LABEL: @test_svmop4a_1x2_za64_u16_u16(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: tail call void @llvm.aarch64.sme.umop4a.za64.wide.1x2.nxv8i16(i32 3, <vscale x 8 x i16> [[ZN:%.*]], <vscale x 8 x i16> [[ZM_COERCE0:%.*]], <vscale x 8 x i16> [[ZM_COERCE1:%.*]])
+// CHECK-NEXT: ret void
+//
+// CPP-CHECK-LABEL: @_Z29test_svmop4a_1x2_za64_u16_u16u12__SVUint16_t12svuint16x2_t(
+// CPP-CHECK-NEXT: entry:
+// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.umop4a.za64.wide.1x2.nxv8i16(i32 3, <vscale x 8 x i16> [[ZN:%.*]], <vscale x 8 x i16> [[ZM_COERCE0:%.*]], <vscale x 8 x i16> [[ZM_COERCE1:%.*]])
+// CPP-CHECK-NEXT: ret void
+//
+void test_svmop4a_1x2_za64_u16_u16(svuint16_t zn, svuint16x2_t zm) __arm_streaming __arm_inout("za") {
+ SME_ACLE_FUNC(svmop4a_1x2_za64,_u16_u16,)(3, zn, zm);
+}
+
+// CHECK-LABEL: @test_svmop4s_1x2_za64_u16_u16(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: tail call void @llvm.aarch64.sme.umop4s.za64.wide.1x2.nxv8i16(i32 3, <vscale x 8 x i16> [[ZN:%.*]], <vscale x 8 x i16> [[ZM_COERCE0:%.*]], <vscale x 8 x i16> [[ZM_COERCE1:%.*]])
+// CHECK-NEXT: ret void
+//
+// CPP-CHECK-LABEL: @_Z29test_svmop4s_1x2_za64_u16_u16u12__SVUint16_t12svuint16x2_t(
+// CPP-CHECK-NEXT: entry:
+// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.umop4s.za64.wide.1x2.nxv8i16(i32 3, <vscale x 8 x i16> [[ZN:%.*]], <vscale x 8 x i16> [[ZM_COERCE0:%.*]], <vscale x 8 x i16> [[ZM_COERCE1:%.*]])
+// CPP-CHECK-NEXT: ret void
+//
+void test_svmop4s_1x2_za64_u16_u16(svuint16_t zn, svuint16x2_t zm) __arm_streaming __arm_inout("za") {
+ SME_ACLE_FUNC(svmop4s_1x2_za64,_u16_u16,)(3, zn, zm);
+}
+
+// CHECK-LABEL: @test_svmop4a_1x2_za64_s16_u16(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: tail call void @llvm.aarch64.sme.sumop4a.za64.wide.1x2.nxv8i16(i32 3, <vscale x 8 x i16> [[ZN:%.*]], <vscale x 8 x i16> [[ZM_COERCE0:%.*]], <vscale x 8 x i16> [[ZM_COERCE1:%.*]])
+// CHECK-NEXT: ret void
+//
+// CPP-CHECK-LABEL: @_Z29test_svmop4a_1x2_za64_s16_u16u11__SVInt16_t12svuint16x2_t(
+// CPP-CHECK-NEXT: entry:
+// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.sumop4a.za64.wide.1x2.nxv8i16(i32 3, <vscale x 8 x i16> [[ZN:%.*]], <vscale x 8 x i16> [[ZM_COERCE0:%.*]], <vscale x 8 x i16> [[ZM_COERCE1:%.*]])
+// CPP-CHECK-NEXT: ret void
+//
+void test_svmop4a_1x2_za64_s16_u16(svint16_t zn, svuint16x2_t zm) __arm_streaming __arm_inout("za") {
+ SME_ACLE_FUNC(svmop4a_1x2_za64,_s16_u16,)(3, zn, zm);
+}
+
+// CHECK-LABEL: @test_svmop4s_1x2_za64_s16_u16(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: tail call void @llvm.aarch64.sme.sumop4s.za64.wide.1x2.nxv8i16(i32 3, <vscale x 8 x i16> [[ZN:%.*]], <vscale x 8 x i16> [[ZM_COERCE0:%.*]], <vscale x 8 x i16> [[ZM_COERCE1:%.*]])
+// CHECK-NEXT: ret void
+//
+// CPP-CHECK-LABEL: @_Z29test_svmop4s_1x2_za64_s16_u16u11__SVInt16_t12svuint16x2_t(
+// CPP-CHECK-NEXT: entry:
+// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.sumop4s.za64.wide.1x2.nxv8i16(i32 3, <vscale x 8 x i16> [[ZN:%.*]], <vscale x 8 x i16> [[ZM_COERCE0:%.*]], <vscale x 8 x i16> [[ZM_COERCE1:%.*]])
+// CPP-CHECK-NEXT: ret void
+//
+void test_svmop4s_1x2_za64_s16_u16(svint16_t zn, svuint16x2_t zm) __arm_streaming __arm_inout("za") {
+ SME_ACLE_FUNC(svmop4s_1x2_za64,_s16_u16,)(3, zn, zm);
+}
+
+// CHECK-LABEL: @test_svmop4a_1x2_za64_u16_s16(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: tail call void @llvm.aarch64.sme.usmop4a.za64.wide.1x2.nxv8i16(i32 3, <vscale x 8 x i16> [[ZN:%.*]], <vscale x 8 x i16> [[ZM_COERCE0:%.*]], <vscale x 8 x i16> [[ZM_COERCE1:%.*]])
+// CHECK-NEXT: ret void
+//
+// CPP-CHECK-LABEL: @_Z29test_svmop4a_1x2_za64_u16_s16u12__SVUint16_t11svint16x2_t(
+// CPP-CHECK-NEXT: entry:
+// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.usmop4a.za64.wide.1x2.nxv8i16(i32 3, <vscale x 8 x i16> [[ZN:%.*]], <vscale x 8 x i16> [[ZM_COERCE0:%.*]], <vscale x 8 x i16> [[ZM_COERCE1:%.*]])
+// CPP-CHECK-NEXT: ret void
+//
+void test_svmop4a_1x2_za64_u16_s16(svuint16_t zn, svint16x2_t zm) __arm_streaming __arm_inout("za") {
+ SME_ACLE_FUNC(svmop4a_1x2_za64,_u16_s16,)(3, zn, zm);
+}
+
+// CHECK-LABEL: @test_svmop4s_1x2_za64_u16_s16(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: tail call void @llvm.aarch64.sme.usmop4s.za64.wide.1x2.nxv8i16(i32 3, <vscale x 8 x i16> [[ZN:%.*]], <vscale x 8 x i16> [[ZM_COERCE0:%.*]], <vscale x 8 x i16> [[ZM_COERCE1:%.*]])
+// CHECK-NEXT: ret void
+//
+// CPP-CHECK-LABEL: @_Z29test_svmop4s_1x2_za64_u16_s16u12__SVUint16_t11svint16x2_t(
+// CPP-CHECK-NEXT: entry:
+// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.usmop4s.za64.wide.1x2.nxv8i16(i32 3, <vscale x 8 x i16> [[ZN:%.*]], <vscale x 8 x i16> [[ZM_COERCE0:%.*]], <vscale x 8 x i16> [[ZM_COERCE1:%.*]])
+// CPP-CHECK-NEXT: ret void
+//
+void test_svmop4s_1x2_za64_u16_s16(svuint16_t zn, svint16x2_t zm) __arm_streaming __arm_inout("za") {
+ SME_ACLE_FUNC(svmop4s_1x2_za64,_u16_s16,)(3, zn, zm);
+}
+
+
+// CHECK-LABEL: @test_svmop4a_1x2_za16_f16_f16(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: tail call void @llvm.aarch64.sme.mop4a.1x2.nxv8f16(i32 3, <vscale x 8 x half> [[ZN:%.*]], <vscale x 8 x half> [[ZM_COERCE0:%.*]], <vscale x 8 x half> [[ZM_COERCE1:%.*]])
+// CHECK-NEXT: ret void
+//
+// CPP-CHECK-LABEL: @_Z29test_svmop4a_1x2_za16_f16_f16u13__SVFloat16_t13svfloat16x2_t(
+// CPP-CHECK-NEXT: entry:
+// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.mop4a.1x2.nxv8f16(i32 3, <vscale x 8 x half> [[ZN:%.*]], <vscale x 8 x half> [[ZM_COERCE0:%.*]], <vscale x 8 x half> [[ZM_COERCE1:%.*]])
+// CPP-CHECK-NEXT: ret void
+//
+void test_svmop4a_1x2_za16_f16_f16(svfloat16_t zn, svfloat16x2_t zm) __arm_streaming __arm_inout("za") {
+ SME_ACLE_FUNC(svmop4a_1x2_za16,_f16_f16,)(3, zn, zm);
+}
+
+// CHECK-LABEL: @test_svmop4s_1x2_za16_f16_f16(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: tail call void @llvm.aarch64.sme.mop4s.1x2.nxv8f16(i32 3, <vscale x 8 x half> [[ZN:%.*]], <vscale x 8 x half> [[ZM_COERCE0:%.*]], <vscale x 8 x half> [[ZM_COERCE1:%.*]])
+// CHECK-NEXT: ret void
+//
+// CPP-CHECK-LABEL: @_Z29test_svmop4s_1x2_za16_f16_f16u13__SVFloat16_t13svfloat16x2_t(
+// CPP-CHECK-NEXT: entry:
+// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.mop4s.1x2.nxv8f16(i32 3, <vscale x 8 x half> [[ZN:%.*]], <vscale x 8 x half> [[ZM_COERCE0:%.*]], <vscale x 8 x half> [[ZM_COERCE1:%.*]])
+// CPP-CHECK-NEXT: ret void
+//
+void test_svmop4s_1x2_za16_f16_f16(svfloat16_t zn, svfloat16x2_t zm) __arm_streaming __arm_inout("za") {
+ SME_ACLE_FUNC(svmop4s_1x2_za16,_f16_f16,)(3, zn, zm);
+}
+
+// CHECK-LABEL: @test_svmop4a_1x2_za32_f32_f32(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: tail call void @llvm.aarch64.sme.mop4a.1x2.nxv4f32(i32 3, <vscale x 4 x float> [[ZN:%.*]], <vscale x 4 x float> [[ZM_COERCE0:%.*]], <vscale x 4 x float> [[ZM_COERCE1:%.*]])
+// CHECK-NEXT: ret void
+//
+// CPP-CHECK-LABEL: @_Z29test_svmop4a_1x2_za32_f32_f32u13__SVFloat32_t13svfloat32x2_t(
+// CPP-CHECK-NEXT: entry:
+// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.mop4a.1x2.nxv4f32(i32 3, <vscale x 4 x float> [[ZN:%.*]], <vscale x 4 x float> [[ZM_COERCE0:%.*]], <vscale x 4 x float> [[ZM_COERCE1:%.*]])
+// CPP-CHECK-NEXT: ret void
+//
+void test_svmop4a_1x2_za32_f32_f32(svfloat32_t zn, svfloat32x2_t zm) __arm_streaming __arm_inout("za") {
+ SME_ACLE_FUNC(svmop4a_1x2_za32,_f32_f32,)(3, zn, zm);
+}
+
+// CHECK-LABEL: @test_svmop4s_1x2_za32_f32_f32(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: tail call void @llvm.aarch64.sme.mop4s.1x2.nxv4f32(i32 3, <vscale x 4 x float> [[ZN:%.*]], <vscale x 4 x float> [[ZM_COERCE0:%.*]], <vscale x 4 x float> [[ZM_COERCE1:%.*]])
+// CHECK-NEXT: ret void
+//
+// CPP-CHECK-LABEL: @_Z29test_svmop4s_1x2_za32_f32_f32u13__SVFloat32_t13svfloat32x2_t(
+// CPP-CHECK-NEXT: entry:
+// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.mop4s.1x2.nxv4f32(i32 3, <vscale x 4 x float> [[ZN:%.*]], <vscale x 4 x float> [[ZM_COERCE0:%.*]], <vscale x 4 x float> [[ZM_COERCE1:%.*]])
+// CPP-CHECK-NEXT: ret void
+//
+void test_svmop4s_1x2_za32_f32_f32(svfloat32_t zn, svfloat32x2_t zm) __arm_streaming __arm_inout("za") {
+ SME_ACLE_FUNC(svmop4s_1x2_za32,_f32_f32,)(3, zn, zm);
+}
+
+// CHECK-LABEL: @test_svmop4a_1x2_za64_f64_f64(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: tail call void @llvm.aarch64.sme.mop4a.1x2.nxv2f64(i32 3, <vscale x 2 x double> [[ZN:%.*]], <vscale x 2 x double> [[ZM_COERCE0:%.*]], <vscale x 2 x double> [[ZM_COERCE1:%.*]])
+// CHECK-NEXT: ret void
+//
+// CPP-CHECK-LABEL: @_Z29test_svmop4a_1x2_za64_f64_f64u13__SVFloat64_t13svfloat64x2_t(
+// CPP-CHECK-NEXT: entry:
+// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.mop4a.1x2.nxv2f64(i32 3, <vscale x 2 x double> [[ZN:%.*]], <vscale x 2 x double> [[ZM_COERCE0:%.*]], <vscale x 2 x double> [[ZM_COERCE1:%.*]])
+// CPP-CHECK-NEXT: ret void
+//
+void test_svmop4a_1x2_za64_f64_f64(svfloat64_t zn, svfloat64x2_t zm) __arm_streaming __arm_inout("za") {
+ SME_ACLE_FUNC(svmop4a_1x2_za64,_f64_f64,)(3, zn, zm);
+}
+
+// CHECK-LABEL: @test_svmop4s_1x2_za64_f64_f64(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: tail call void @llvm.aarch64.sme.mop4s.1x2.nxv2f64(i32 3, <vscale x 2 x double> [[ZN:%.*]], <vscale x 2 x double> [[ZM_COERCE0:%.*]], <vscale x 2 x double> [[ZM_COERCE1:%.*]])
+// CHECK-NEXT: ret void
+//
+// CPP-CHECK-LABEL: @_Z29test_svmop4s_1x2_za64_f64_f64u13__SVFloat64_t13svfloat64x2_t(
+// CPP-CHECK-NEXT: entry:
+// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.mop4s.1x2.nxv2f64(i32 3, <vscale x 2 x double> [[ZN:%.*]], <vscale x 2 x double> [[ZM_COERCE0:%.*]], <vscale x 2 x double> [[ZM_COERCE1:%.*]])
+// CPP-CHECK-NEXT: ret void
+//
+void test_svmop4s_1x2_za64_f64_f64(svfloat64_t zn, svfloat64x2_t zm) __arm_streaming __arm_inout("za") {
+ SME_ACLE_FUNC(svmop4s_1x2_za64,_f64_f64,)(3, zn, zm);
+}
+
+// CHECK-LABEL: @test_svmop4a_1x2_za16_bf16_bf16(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: tail call void @llvm.aarch64.sme.mop4a.1x2.nxv8bf16(i32 3, <vscale x 8 x bfloat> [[ZN:%.*]], <vscale x 8 x bfloat> [[ZM_COERCE0:%.*]], <vscale x 8 x bfloat> [[ZM_COERCE1:%.*]])
+// CHECK-NEXT: ret void
+//
+// CPP-CHECK-LABEL: @_Z31test_svmop4a_1x2_za16_bf16_bf16u14__SVBfloat16_t14svbfloat16x2_t(
+// CPP-CHECK-NEXT: entry:
+// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.mop4a.1x2.nxv8bf16(i32 3, <vscale x 8 x bfloat> [[ZN:%.*]], <vscale x 8 x bfloat> [[ZM_COERCE0:%.*]], <vscale x 8 x bfloat> [[ZM_COERCE1:%.*]])
+// CPP-CHECK-NEXT: ret void
+//
+void test_svmop4a_1x2_za16_bf16_bf16(svbfloat16_t zn, svbfloat16x2_t zm) __arm_streaming __arm_inout("za") {
+ SME_ACLE_FUNC(svmop4a_1x2_za16,_bf16_bf16,)(3, zn, zm);
+}
+
+// CHECK-LABEL: @test_svmop4s_1x2_za16_bf16_bf16(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: tail call void @llvm.aarch64.sme.mop4s.1x2.nxv8bf16(i32 3, <vscale x 8 x bfloat> [[ZN:%.*]], <vscale x 8 x bfloat> [[ZM_COERCE0:%.*]], <vscale x 8 x bfloat> [[ZM_COERCE1:%.*]])
+// CHECK-NEXT: ret void
+//
+// CPP-CHECK-LABEL: @_Z31test_svmop4s_1x2_za16_bf16_bf16u14__SVBfloat16_t14svbfloat16x2_t(
+// CPP-CHECK-NEXT: entry:
+// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.mop4s.1x2.nxv8bf16(i32 3, <vscale x 8 x bfloat> [[ZN:%.*]], <vscale x 8 x bfloat> [[ZM_COERCE0:%.*]], <vscale x 8 x bfloat> [[ZM_COERCE1:%.*]])
+// CPP-CHECK-NEXT: ret void
+//
+void test_svmop4s_1x2_za16_bf16_bf16(svbfloat16_t zn, svbfloat16x2_t zm) __arm_streaming __arm_inout("za") {
+ SME_ACLE_FUNC(svmop4s_1x2_za16,_bf16_bf16,)(3, zn, zm);
+}
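+
+// Macro expansion, for illustration: without SVE_OVERLOADED_FORMS,
+//   SME_ACLE_FUNC(svmop4s_1x2_za16,_bf16_bf16,)(3, zn, zm)
+// expands to svmop4s_1x2_za16_bf16_bf16(3, zn, zm); with
+// -DSVE_OVERLOADED_FORMS the type suffix is dropped and the overloaded
+// form svmop4s_1x2_za16(3, zn, zm) is called instead.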
diff --git a/clang/utils/TableGen/SveEmitter.cpp b/clang/utils/TableGen/SveEmitter.cpp
index e226987b4844b..63452d26654be 100644
--- a/clang/utils/TableGen/SveEmitter.cpp
+++ b/clang/utils/TableGen/SveEmitter.cpp
@@ -1043,7 +1043,9 @@ std::string Intrinsic::replaceTemplatedArgs(std::string Name, TypeSpec TS,
case '1':
case '2':
case '3':
- T = SVEType(TS, Proto[C - '0']);
+      // Extract the modifier via getProtoModifier before constructing the
+      // SVEType so that numeric modifiers are handled correctly.
+ auto [Mod, NumVectors] = getProtoModifier(Proto, (C - '0'));
+ T = SVEType(TS, Mod);
break;
}
diff --git a/llvm/include/llvm/IR/IntrinsicsAArch64.td b/llvm/include/llvm/IR/IntrinsicsAArch64.td
index eeea8d77d1e16..222c3d63cd68c 100644
--- a/llvm/include/llvm/IR/IntrinsicsAArch64.td
+++ b/llvm/include/llvm/IR/IntrinsicsAArch64.td
@@ -3099,6 +3099,41 @@ let TargetPrefix = "aarch64" in {
def int_aarch64_sme_usmop4a_za64_wide_1x1 : SME_OuterProduct_QuaterTile_Single;
def int_aarch64_sme_usmop4s_za64_wide_1x1 : SME_OuterProduct_QuaterTile_Single;
+ class SME_OuterProduct_QuaterTile_Multi
+ : DefaultAttrsIntrinsic<[],
+ [llvm_i32_ty,
+ llvm_anyvector_ty,
+ LLVMMatchType<0>,
+ LLVMMatchType<0>], [ImmArg<ArgIndex<0>>]>;
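+  // Operands: ZA tile index (immediate), the single Zn vector, then the two
+  // halves of the Zm multi-vector; all vectors share one element type.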
+ def int_aarch64_sme_mop4a_wide_1x2 : SME_OuterProduct_QuaterTile_Multi;
+ def int_aarch64_sme_mop4s_wide_1x2 : SME_OuterProduct_QuaterTile_Multi;
+ def int_aarch64_sme_mop4a_1x2 : SME_OuterProduct_QuaterTile_Multi;
+ def int_aarch64_sme_mop4s_1x2 : SME_OuterProduct_QuaterTile_Multi;
+ def int_aarch64_sme_smop4a_wide_1x2 : SME_OuterProduct_QuaterTile_Multi;
+ def int_aarch64_sme_smop4s_wide_1x2 : SME_OuterProduct_QuaterTile_Multi;
+ def int_aarch64_sme_smop4a_1x2 : SME_OuterProduct_QuaterTile_Multi;
+ def int_aarch64_sme_smop4s_1x2 : SME_OuterProduct_QuaterTile_Multi;
+ def int_aarch64_sme_umop4a_wide_1x2 : SME_OuterProduct_QuaterTile_Multi;
+ def int_aarch64_sme_umop4s_wide_1x2 : SME_OuterProduct_QuaterTile_Multi;
+ def int_aarch64_sme_umop4a_1x2 : SME_OuterProduct_QuaterTile_Multi;
+ def int_aarch64_sme_umop4s_1x2 : SME_OuterProduct_QuaterTile_Multi;
+ def int_aarch64_sme_sumop4a_wide_1x2 : SME_OuterProduct_QuaterTile_Multi;
+ def int_aarch64_sme_sumop4s_wide_1x2 : SME_OuterProduct_QuaterTile_Multi;
+ def int_aarch64_sme_sumop4a_1x2 : SME_OuterProduct_QuaterTile_Multi;
+ def int_aarch64_sme_sumop4s_1x2 : SME_OuterProduct_QuaterTile_Multi;
+ def int_aarch64_sme_usmop4a_wide_1x2 : SME_OuterProduct_QuaterTile_Multi;
+ def int_aarch64_sme_usmop4s_wide_1x2 : SME_OuterProduct_QuaterTile_Multi;
+ def int_aarch64_sme_usmop4a_1x2 : SME_OuterProduct_QuaterTile_Multi;
+ def int_aarch64_sme_usmop4s_1x2 : SME_OuterProduct_QuaterTile_Multi;
+ def int_aarch64_sme_smop4a_za64_wide_1x2 : SME_OuterProduct_QuaterTile_Multi;
+ def int_aarch64_sme_smop4s_za64_wide_1x2 : SME_OuterProduct_QuaterTile_Multi;
+ def int_aarch64_sme_umop4a_za64_wide_1x2 : SME_OuterProduct_QuaterTile_Multi;
+ def int_aarch64_sme_umop4s_za64_wide_1x2 : SME_OuterProduct_QuaterTile_Multi;
+ def int_aarch64_sme_sumop4a_za64_wide_1x2 : SME_OuterProduct_QuaterTile_Multi;
+ def int_aarch64_sme_sumop4s_za64_wide_1x2 : SME_OuterProduct_QuaterTile_Multi;
+ def int_aarch64_sme_usmop4a_za64_wide_1x2 : SME_OuterProduct_QuaterTile_Multi;
+ def int_aarch64_sme_usmop4s_za64_wide_1x2 : SME_OuterProduct_QuaterTile_Multi;
+
class SME_AddVectorToTile_Intrinsic
: DefaultAttrsIntrinsic<[],
[llvm_i32_ty,
diff --git a/llvm/lib/Target/AArch64/SMEInstrFormats.td b/llvm/lib/Target/AArch64/SMEInstrFormats.td
index d9a25bd51ddfc..8ac49fc2dedfb 100644
--- a/llvm/lib/Target/AArch64/SMEInstrFormats.td
+++ b/llvm/lib/Target/AArch64/SMEInstrFormats.td
@@ -113,6 +113,15 @@ class sme2_quarter_tile_outer_product_pseudo_single_single<ZPRRegOp zn_ty, ZPRRe
let usesCustomInserter = 1;
}
+class sme2_quarter_tile_outer_product_pseudo_single_multi<ZPRRegOp zn_ty, RegisterOperand zm_ty, SMEMatrixTypeEnum za_flag>
+ : Pseudo<(outs), (ins i32imm:$tile,
+ zn_ty:$zn, zm_ty:$zm), []>,
+ Sched<[]> {
+ // Translated to the actual instructions in AArch64ISelLowering.cpp
+ let SMEMatrixType = za_flag;
+ let usesCustomInserter = 1;
+}
+
class sme2_za_array_2op_multi_single_pseudo<string name, Operand index_ty, RegisterOperand multi_vector_ty,
ZPRRegOp zpr_ty, SMEMatrixTypeEnum za_flag>
: SMEPseudo2Instr<name, 0>,
@@ -270,6 +279,9 @@ class SME2_ZA_Tile_TwoVec_Pat<string name, SDPatternOperator intrinsic, Operand
: Pat<(intrinsic imm_ty:$tile, vt:$Zn, vt:$Zm),
(!cast<Instruction>(name # _PSEUDO) $tile, $Zn, $Zm)>;
+class SME2_ZA_Tile_Vec_Multi_Pat<string name, SDPatternOperator intrinsic, Operand imm_ty, ValueType vt>
+ : Pat<(intrinsic imm_ty:$tile, vt:$Zn, vt:$Zm1, vt:$Zm2),
+ (!cast<Instruction>(name # _PSEUDO) $tile, $Zn, (REG_SEQUENCE ZPR2Mul2, vt:$Zm1, zsub0, vt:$Zm2, zsub1))>;
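+// REG_SEQUENCE glues the two Zm inputs into a ZPR2Mul2 pair (zsub0/zsub1),
+// matching the consecutive { zN, zN+1 } register operand of the real
+// instruction, as seen in the mop4a_1x2 codegen tests.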
//===----------------------------------------------------------------------===//
// SME pattern match helpers.
//===----------------------------------------------------------------------===//
@@ -623,7 +635,12 @@ multiclass sme_quarter_outer_product_i8_i32<bit zn_u, bit zm_u, bit subtr, strin
def _M2ZZ_BToS : sme_quarter_outer_product_i8_i32<{zn_u, 1}, {zm_u, 0}, subtr,
ZZ_b_mul_r_Lo, ZPR8Mul2_Hi, mnemonic>;
def _MZ2Z_BToS : sme_quarter_outer_product_i8_i32<{zn_u, 0}, {zm_u, 1}, subtr,
- ZPR8Mul2_Lo, ZZ_b_mul_r_Hi, mnemonic>;
+ ZPR8Mul2_Lo, ZZ_b_mul_r_Hi, mnemonic>, SMEPseudo2Instr<NAME # _MZ2Z_BToS, 1>;
+
+ def NAME # _MZ2Z_BToS # _PSEUDO : sme2_quarter_tile_outer_product_pseudo_single_multi<ZPR8Mul2_Lo, ZZ_b_mul_r_Hi, SMEMatrixTileS>, SMEPseudo2Instr<NAME # _MZ2Z_BToS, 0>;
+
+ def : SME2_ZA_Tile_Vec_Multi_Pat<NAME # _MZ2Z_BToS, !cast<SDPatternOperator>(op # "_1x2"), timm32_0_3, nxv16i8>;
+
def _M2Z2Z_BToS : sme_quarter_outer_product_i8_i32<{zn_u, 1}, {zm_u, 1}, subtr,
ZZ_b_mul_r_Lo, ZZ_b_mul_r_Hi, mnemonic>;
}
@@ -639,7 +656,12 @@ multiclass sme_quarter_outer_product_i16_i32<bit unsigned, bit subtr, string mne
def _M2ZZ_HToS : sme_quarter_outer_product_i16_i32<unsigned, 0b1, 0b0, subtr,
ZZ_h_mul_r_Lo, ZPR16Mul2_Hi, mnemonic>;
def _MZ2Z_HToS : sme_quarter_outer_product_i16_i32<unsigned, 0b0, 0b1, subtr,
- ZPR16Mul2_Lo, ZZ_h_mul_r_Hi, mnemonic>;
+ ZPR16Mul2_Lo, ZZ_h_mul_r_Hi, mnemonic>, SMEPseudo2Instr<NAME # _MZ2Z_HToS, 1>;
+
+ def NAME # _MZ2Z_HToS # _PSEUDO : sme2_quarter_tile_outer_product_pseudo_single_multi<ZPR16Mul2_Lo, ZZ_h_mul_r_Hi, SMEMatrixTileS>, SMEPseudo2Instr<NAME # _MZ2Z_HToS, 0>;
+
+ def : SME2_ZA_Tile_Vec_Multi_Pat<NAME # _MZ2Z_HToS, !cast<SDPatternOperator>(op # "_1x2"), timm32_0_3, nxv8i16>;
+
def _M2Z2Z_HToS : sme_quarter_outer_product_i16_i32<unsigned, 0b1, 0b1, subtr,
ZZ_h_mul_r_Lo, ZZ_h_mul_r_Hi, mnemonic>;
}
@@ -655,7 +677,12 @@ multiclass sme_quarter_outer_product_i64<bit zn_u, bit zm_u, bit subtr, string m
def _M2ZZ_HtoD : sme_quarter_outer_product_i64<{zn_u, 1}, {zm_u, 0}, subtr,
ZZ_h_mul_r_Lo, ZPR16Mul2_Hi, mnemonic>;
def _MZ2Z_HtoD : sme_quarter_outer_product_i64<{zn_u, 0}, {zm_u, 1}, subtr,
- ZPR16Mul2_Lo, ZZ_h_mul_r_Hi, mnemonic>;
+ ZPR16Mul2_Lo, ZZ_h_mul_r_Hi, mnemonic>, SMEPseudo2Instr<NAME # _MZ2Z_HtoD, 1>;
+
+ def NAME # _MZ2Z_HtoD # _PSEUDO : sme2_quarter_tile_outer_product_pseudo_single_multi<ZPR16Mul2_Lo, ZZ_h_mul_r_Hi, SMEMatrixTileD>, SMEPseudo2Instr<NAME # _MZ2Z_HtoD, 0>;
+
+ def : SME2_ZA_Tile_Vec_Multi_Pat<NAME # _MZ2Z_HtoD, !cast<SDPatternOperator>(op # "_1x2"), timm32_0_3, nxv8i16>;
+
def _M2Z2Z_HtoD : sme_quarter_outer_product_i64<{zn_u, 1}, {zm_u, 1}, subtr,
ZZ_h_mul_r_Lo, ZZ_h_mul_r_Hi, mnemonic>;
}
@@ -5509,7 +5536,12 @@ multiclass sme2_bfmop4as_widening<bit S, string mnemonic, string op> {
def _M2ZZ_S : sme2_bf16_fp32_quarter_tile_outer_product<0, 1, S, mnemonic, ZZ_h_mul_r_Lo, ZPR16Mul2_Hi>;
// Single and multiple vectors
- def _MZ2Z_S : sme2_bf16_fp32_quarter_tile_outer_product<1, 0, S, mnemonic, ZPR16Mul2_Lo, ZZ_h_mul_r_Hi>;
+ def _MZ2Z_S : sme2_bf16_fp32_quarter_tile_outer_product<1, 0, S, mnemonic, ZPR16Mul2_Lo, ZZ_h_mul_r_Hi>, SMEPseudo2Instr<NAME # _MZ2Z_S, 1>;
+
+ def NAME # _MZ2Z_S # _PSEUDO : sme2_quarter_tile_outer_product_pseudo_single_multi<ZPR16Mul2_Lo, ZZ_h_mul_r_Hi, SMEMatrixTileS>, SMEPseudo2Instr<NAME # _MZ2Z_S, 0>;
+
+ def : SME2_ZA_Tile_Vec_Multi_Pat<NAME # _MZ2Z_S, !cast<SDPatternOperator>(op # "_1x2"), timm32_0_3, nxv8bf16>;
+
// Multiple vectors
def _M2Z2Z_S : sme2_bf16_fp32_quarter_tile_outer_product<1, 1, S, mnemonic, ZZ_h_mul_r_Lo, ZZ_h_mul_r_Hi>;
@@ -5660,7 +5692,11 @@ multiclass sme2_fmop4as_fp16_non_widening<bit S, string mnemonic, string op> {
def _M2ZZ_H : sme2_fp16_quarter_tile_outer_product<0, 1, S, mnemonic, ZZ_h_mul_r_Lo, ZPR16Mul2_Hi>;
// Single and multiple vectors
- def _MZ2Z_H : sme2_fp16_quarter_tile_outer_product<1, 0, S, mnemonic, ZPR16Mul2_Lo, ZZ_h_mul_r_Hi>;
+ def _MZ2Z_H : sme2_fp16_quarter_tile_outer_product<1, 0, S, mnemonic, ZPR16Mul2_Lo, ZZ_h_mul_r_Hi>, SMEPseudo2Instr<NAME # _MZ2Z_H, 1>;
+
+ def NAME # _MZ2Z_H # _PSEUDO : sme2_quarter_tile_outer_product_pseudo_single_multi<ZPR16Mul2_Lo, ZZ_h_mul_r_Hi, SMEMatrixTileH>, SMEPseudo2Instr<NAME # _MZ2Z_H, 0>;
+
+ def : SME2_ZA_Tile_Vec_Multi_Pat<NAME # _MZ2Z_H, !cast<SDPatternOperator>(op # "_1x2"), timm32_0_3, nxv8f16>;
// Multiple vectors
def _M2Z2Z_H : sme2_fp16_quarter_tile_outer_product<1, 1, S, mnemonic, ZZ_h_mul_r_Lo, ZZ_h_mul_r_Hi>;
@@ -5736,7 +5772,11 @@ multiclass sme2_bfmop4as_non_widening<bit S, string mnemonic, string op> {
def _M2ZZ_H : sme2_bf16_fp16_quarter_tile_outer_product<0, 1, S, mnemonic, ZZ_h_mul_r_Lo, ZPR16Mul2_Hi>;
// Single and multiple vectors
- def _MZ2Z_H : sme2_bf16_fp16_quarter_tile_outer_product<1, 0, S, mnemonic, ZPR16Mul2_Lo, ZZ_h_mul_r_Hi>;
+ def _MZ2Z_H : sme2_bf16_fp16_quarter_tile_outer_product<1, 0, S, mnemonic, ZPR16Mul2_Lo, ZZ_h_mul_r_Hi>, SMEPseudo2Instr<NAME # _MZ2Z_H, 1>;
+
+ def NAME # _MZ2Z_H # _PSEUDO : sme2_quarter_tile_outer_product_pseudo_single_multi<ZPR16Mul2_Lo, ZZ_h_mul_r_Hi, SMEMatrixTileH>, SMEPseudo2Instr<NAME # _MZ2Z_H, 0>;
+
+ def : SME2_ZA_Tile_Vec_Multi_Pat<NAME # _MZ2Z_H, !cast<SDPatternOperator>(op # "_1x2"), timm32_0_3, nxv8bf16>;
// Multiple vectors
def _M2Z2Z_H : sme2_bf16_fp16_quarter_tile_outer_product<1, 1, S, mnemonic, ZZ_h_mul_r_Lo, ZZ_h_mul_r_Hi>;
@@ -5777,7 +5817,11 @@ multiclass sme2_fmop4as_fp32_non_widening<bit S, string mnemonic, string op> {
def _M2ZZ_S : sme2_fp32_quarter_tile_outer_product<0, 1, S, mnemonic, ZZ_s_mul_r_Lo, ZPR32Mul2_Hi>;
// Single and multiple vectors
- def _MZ2Z_S : sme2_fp32_quarter_tile_outer_product<1, 0, S, mnemonic, ZPR32Mul2_Lo, ZZ_s_mul_r_Hi>;
+ def _MZ2Z_S : sme2_fp32_quarter_tile_outer_product<1, 0, S, mnemonic, ZPR32Mul2_Lo, ZZ_s_mul_r_Hi>, SMEPseudo2Instr<NAME # _MZ2Z_S, 1>;
+
+ def NAME # _MZ2Z_S # _PSEUDO : sme2_quarter_tile_outer_product_pseudo_single_multi<ZPR32Mul2_Lo, ZZ_s_mul_r_Hi, SMEMatrixTileS>, SMEPseudo2Instr<NAME # _MZ2Z_S, 0>;
+
+ def : SME2_ZA_Tile_Vec_Multi_Pat<NAME # _MZ2Z_S, !cast<SDPatternOperator>(op # "_1x2"), timm32_0_3, nxv4f32>;
// Multiple vectors
def _M2Z2Z_S : sme2_fp32_quarter_tile_outer_product<1, 1, S, mnemonic, ZZ_s_mul_r_Lo, ZZ_s_mul_r_Hi>;
@@ -5818,7 +5862,11 @@ multiclass sme2_fmop4as_fp64_non_widening<bit S, string mnemonic, string op> {
def _M2ZZ_D : sme2_fp64_quarter_tile_outer_product<0, 1, S, mnemonic, ZZ_d_mul_r_Lo, ZPR64Mul2_Hi>;
// Single and multiple vectors
- def _MZ2Z_D : sme2_fp64_quarter_tile_outer_product<1, 0, S, mnemonic, ZPR64Mul2_Lo, ZZ_d_mul_r_Hi>;
+ def _MZ2Z_D : sme2_fp64_quarter_tile_outer_product<1, 0, S, mnemonic, ZPR64Mul2_Lo, ZZ_d_mul_r_Hi>, SMEPseudo2Instr<NAME # _MZ2Z_D, 1>;
+
+ def NAME # _MZ2Z_D # _PSEUDO : sme2_quarter_tile_outer_product_pseudo_single_multi<ZPR64Mul2_Lo, ZZ_d_mul_r_Hi, SMEMatrixTileD>, SMEPseudo2Instr<NAME # _MZ2Z_D, 0>;
+
+ def : SME2_ZA_Tile_Vec_Multi_Pat<NAME # _MZ2Z_D, !cast<SDPatternOperator>(op # "_1x2"), timm32_0_3, nxv2f64>;
// Multiple vectors
def _M2Z2Z_D : sme2_fp64_quarter_tile_outer_product<1, 1, S, mnemonic, ZZ_d_mul_r_Lo, ZZ_d_mul_r_Hi>;
@@ -5859,7 +5907,11 @@ multiclass sme2_fmop4as_fp16_fp32_widening<bit S, string mnemonic, string op> {
def _M2ZZ_HtoS : sme2_fp16_fp32_quarter_tile_outer_product<0, 1, S, mnemonic, ZZ_h_mul_r_Lo, ZPR16Mul2_Hi>;
// Single and multiple vectors
- def _MZ2Z_HtoS : sme2_fp16_fp32_quarter_tile_outer_product<1, 0, S, mnemonic, ZPR16Mul2_Lo, ZZ_h_mul_r_Hi>;
+ def _MZ2Z_HtoS : sme2_fp16_fp32_quarter_tile_outer_product<1, 0, S, mnemonic, ZPR16Mul2_Lo, ZZ_h_mul_r_Hi>, SMEPseudo2Instr<NAME # _MZ2Z_HtoS, 1>;
+
+ def NAME # _MZ2Z_HtoS # _PSEUDO : sme2_quarter_tile_outer_product_pseudo_single_multi<ZPR16Mul2_Lo, ZZ_h_mul_r_Hi, SMEMatrixTileS>, SMEPseudo2Instr<NAME # _MZ2Z_HtoS, 0>;
+
+ def : SME2_ZA_Tile_Vec_Multi_Pat<NAME # _MZ2Z_HtoS, !cast<SDPatternOperator>(op # "_1x2"), timm32_0_3, nxv8f16>;
// Multiple vectors
def _M2Z2Z_HtoS : sme2_fp16_fp32_quarter_tile_outer_product<1, 1, S, mnemonic, ZZ_h_mul_r_Lo, ZZ_h_mul_r_Hi>;
diff --git a/llvm/test/CodeGen/AArch64/sme2-intrinsics-mop4a_1x2.ll b/llvm/test/CodeGen/AArch64/sme2-intrinsics-mop4a_1x2.ll
new file mode 100644
index 0000000000000..6e2733b6c6b64
--- /dev/null
+++ b/llvm/test/CodeGen/AArch64/sme2-intrinsics-mop4a_1x2.ll
@@ -0,0 +1,361 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 4
+; RUN: llc -force-streaming -verify-machineinstrs < %s | FileCheck %s
+
+target triple = "aarch64-linux"
+
+; Widening
+define void @mop4a_za32_s8(<vscale x 16 x i8> %zn, <vscale x 16 x i8> %zm1, <vscale x 16 x i8> %zm2) #0 {
+; CHECK-LABEL: mop4a_za32_s8:
+; CHECK: // %bb.0:
+; CHECK-NEXT: mov z25.d, z2.d
+; CHECK-NEXT: mov z24.d, z1.d
+; CHECK-NEXT: smop4a za1.s, z0.b, { z24.b, z25.b }
+; CHECK-NEXT: ret
+ call void @llvm.aarch64.sme.smop4a.wide.1x2.nxv16i8(i32 1, <vscale x 16 x i8> %zn, <vscale x 16 x i8> %zm1, <vscale x 16 x i8> %zm2)
+ ret void
+}
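+; Note: the Zm pair must be allocated to a consecutive, even-aligned
+; register pair (ZPR2Mul2), hence the copies into z24/z25 above.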
+
+define void @mop4s_za32_s8(<vscale x 16 x i8> %zn, <vscale x 16 x i8> %zm1, <vscale x 16 x i8> %zm2) #0 {
+; CHECK-LABEL: mop4s_za32_s8:
+; CHECK: // %bb.0:
+; CHECK-NEXT: mov z25.d, z2.d
+; CHECK-NEXT: mov z24.d, z1.d
+; CHECK-NEXT: smop4s za1.s, z0.b, { z24.b, z25.b }
+; CHECK-NEXT: ret
+ call void @llvm.aarch64.sme.smop4s.wide.1x2.nxv16i8(i32 1, <vscale x 16 x i8> %zn, <vscale x 16 x i8> %zm1, <vscale x 16 x i8> %zm2)
+ ret void
+}
+
+define void @mop4a_za32_u8(<vscale x 16 x i8> %zn, <vscale x 16 x i8> %zm1, <vscale x 16 x i8> %zm2) #0 {
+; CHECK-LABEL: mop4a_za32_u8:
+; CHECK: // %bb.0:
+; CHECK-NEXT: mov z25.d, z2.d
+; CHECK-NEXT: mov z24.d, z1.d
+; CHECK-NEXT: umop4a za1.s, z0.b, { z24.b, z25.b }
+; CHECK-NEXT: ret
+ call void @llvm.aarch64.sme.umop4a.wide.1x2.nxv16i8(i32 1, <vscale x 16 x i8> %zn, <vscale x 16 x i8> %zm1, <vscale x 16 x i8> %zm2)
+ ret void
+}
+
+define void @mop4s_za32_u8(<vscale x 16 x i8> %zn, <vscale x 16 x i8> %zm1, <vscale x 16 x i8> %zm2) #0 {
+; CHECK-LABEL: mop4s_za32_u8:
+; CHECK: // %bb.0:
+; CHECK-NEXT: mov z25.d, z2.d
+; CHECK-NEXT: mov z24.d, z1.d
+; CHECK-NEXT: umop4s za1.s, z0.b, { z24.b, z25.b }
+; CHECK-NEXT: ret
+ call void @llvm.aarch64.sme.umop4s.wide.1x2.nxv16i8(i32 1, <vscale x 16 x i8> %zn, <vscale x 16 x i8> %zm1, <vscale x 16 x i8> %zm2)
+ ret void
+}
+
+define void @mop4a_za32_s8_u8(<vscale x 16 x i8> %zn, <vscale x 16 x i8> %zm1, <vscale x 16 x i8> %zm2) #0 {
+; CHECK-LABEL: mop4a_za32_s8_u8:
+; CHECK: // %bb.0:
+; CHECK-NEXT: mov z25.d, z2.d
+; CHECK-NEXT: mov z24.d, z1.d
+; CHECK-NEXT: sumop4a za1.s, z0.b, { z24.b, z25.b }
+; CHECK-NEXT: ret
+ call void @llvm.aarch64.sme.sumop4a.wide.1x2.nxv16i8(i32 1, <vscale x 16 x i8> %zn, <vscale x 16 x i8> %zm1, <vscale x 16 x i8> %zm2)
+ ret void
+}
+
+define void @mop4s_za32_s8_u8(<vscale x 16 x i8> %zn, <vscale x 16 x i8> %zm1, <vscale x 16 x i8> %zm2) #0 {
+; CHECK-LABEL: mop4s_za32_s8_u8:
+; CHECK: // %bb.0:
+; CHECK-NEXT: mov z25.d, z2.d
+; CHECK-NEXT: mov z24.d, z1.d
+; CHECK-NEXT: sumop4s za1.s, z0.b, { z24.b, z25.b }
+; CHECK-NEXT: ret
+ call void @llvm.aarch64.sme.sumop4s.wide.1x2.nxv16i8(i32 1, <vscale x 16 x i8> %zn, <vscale x 16 x i8> %zm1, <vscale x 16 x i8> %zm2)
+ ret void
+}
+
+define void @mop4a_za32_u8_s8(<vscale x 16 x i8> %zn, <vscale x 16 x i8> %zm1, <vscale x 16 x i8> %zm2) #0 {
+; CHECK-LABEL: mop4a_za32_u8_s8:
+; CHECK: // %bb.0:
+; CHECK-NEXT: mov z25.d, z2.d
+; CHECK-NEXT: mov z24.d, z1.d
+; CHECK-NEXT: usmop4a za1.s, z0.b, { z24.b, z25.b }
+; CHECK-NEXT: ret
+ call void @llvm.aarch64.sme.usmop4a.wide.1x2.nxv16i8(i32 1, <vscale x 16 x i8> %zn, <vscale x 16 x i8> %zm1, <vscale x 16 x i8> %zm2)
+ ret void
+}
+
+define void @mop4s_za32_u8_s8(<vscale x 16 x i8> %zn, <vscale x 16 x i8> %zm1, <vscale x 16 x i8> %zm2) #0 {
+; CHECK-LABEL: mop4s_za32_u8_s8:
+; CHECK: // %bb.0:
+; CHECK-NEXT: mov z25.d, z2.d
+; CHECK-NEXT: mov z24.d, z1.d
+; CHECK-NEXT: usmop4s za1.s, z0.b, { z24.b, z25.b }
+; CHECK-NEXT: ret
+ call void @llvm.aarch64.sme.usmop4s.wide.1x2.nxv16i8(i32 1, <vscale x 16 x i8> %zn, <vscale x 16 x i8> %zm1, <vscale x 16 x i8> %zm2)
+ ret void
+}
+
+
+define void @mop4a_za32_s16(<vscale x 8 x i16> %zn, <vscale x 8 x i16> %zm1, <vscale x 8 x i16> %zm2) #0 {
+; CHECK-LABEL: mop4a_za32_s16:
+; CHECK: // %bb.0:
+; CHECK-NEXT: mov z25.d, z2.d
+; CHECK-NEXT: mov z24.d, z1.d
+; CHECK-NEXT: smop4a za1.s, z0.h, { z24.h, z25.h }
+; CHECK-NEXT: ret
+ call void @llvm.aarch64.sme.smop4a.wide.1x2.nxv8i16(i32 1, <vscale x 8 x i16> %zn, <vscale x 8 x i16> %zm1, <vscale x 8 x i16> %zm2)
+ ret void
+}
+
+define void @mop4s_za32_s16(<vscale x 8 x i16> %zn, <vscale x 8 x i16> %zm1, <vscale x 8 x i16> %zm2) #0 {
+; CHECK-LABEL: mop4s_za32_s16:
+; CHECK: // %bb.0:
+; CHECK-NEXT: mov z25.d, z2.d
+; CHECK-NEXT: mov z24.d, z1.d
+; CHECK-NEXT: smop4s za1.s, z0.h, { z24.h, z25.h }
+; CHECK-NEXT: ret
+ call void @llvm.aarch64.sme.smop4s.wide.1x2.nxv8i16(i32 1, <vscale x 8 x i16> %zn, <vscale x 8 x i16> %zm1, <vscale x 8 x i16> %zm2)
+ ret void
+}
+
+define void @mop4a_za32_u16(<vscale x 8 x i16> %zn, <vscale x 8 x i16> %zm1, <vscale x 8 x i16> %zm2) #0 {
+; CHECK-LABEL: mop4a_za32_u16:
+; CHECK: // %bb.0:
+; CHECK-NEXT: mov z25.d, z2.d
+; CHECK-NEXT: mov z24.d, z1.d
+; CHECK-NEXT: umop4a za1.s, z0.h, { z24.h, z25.h }
+; CHECK-NEXT: ret
+ call void @llvm.aarch64.sme.umop4a.wide.1x2.nxv8i16(i32 1, <vscale x 8 x i16> %zn, <vscale x 8 x i16> %zm1, <vscale x 8 x i16> %zm2)
+ ret void
+}
+
+define void @mop4s_za32_u16(<vscale x 8 x i16> %zn, <vscale x 8 x i16> %zm1, <vscale x 8 x i16> %zm2) #0 {
+; CHECK-LABEL: mop4s_za32_u16:
+; CHECK: // %bb.0:
+; CHECK-NEXT: mov z25.d, z2.d
+; CHECK-NEXT: mov z24.d, z1.d
+; CHECK-NEXT: umop4s za1.s, z0.h, { z24.h, z25.h }
+; CHECK-NEXT: ret
+ call void @llvm.aarch64.sme.umop4s.wide.1x2.nxv8i16(i32 1, <vscale x 8 x i16> %zn, <vscale x 8 x i16> %zm1, <vscale x 8 x i16> %zm2)
+ ret void
+}
+
+define void @mop4a_za32_f16(<vscale x 8 x half> %zn, <vscale x 8 x half> %zm1, <vscale x 8 x half> %zm2) #0 {
+; CHECK-LABEL: mop4a_za32_f16:
+; CHECK: // %bb.0:
+; CHECK-NEXT: mov z25.d, z2.d
+; CHECK-NEXT: mov z24.d, z1.d
+; CHECK-NEXT: fmop4a za1.s, z0.h, { z24.h, z25.h }
+; CHECK-NEXT: ret
+ call void @llvm.aarch64.sme.mop4a.wide.1x2.nxv8f16(i32 1, <vscale x 8 x half> %zn, <vscale x 8 x half> %zm1, <vscale x 8 x half> %zm2)
+ ret void
+}
+
+define void @mop4s_za32_f16(<vscale x 8 x half> %zn, <vscale x 8 x half> %zm1, <vscale x 8 x half> %zm2) #0 {
+; CHECK-LABEL: mop4s_za32_f16:
+; CHECK: // %bb.0:
+; CHECK-NEXT: mov z25.d, z2.d
+; CHECK-NEXT: mov z24.d, z1.d
+; CHECK-NEXT: fmop4s za1.s, z0.h, { z24.h, z25.h }
+; CHECK-NEXT: ret
+ call void @llvm.aarch64.sme.mop4s.wide.1x2.nxv8f16(i32 1, <vscale x 8 x half> %zn, <vscale x 8 x half> %zm1, <vscale x 8 x half> %zm2)
+ ret void
+}
+
+define void @mop4a_za32_bf16(<vscale x 8 x bfloat> %zn, <vscale x 8 x bfloat> %zm1, <vscale x 8 x bfloat> %zm2) #0 {
+; CHECK-LABEL: mop4a_za32_bf16:
+; CHECK: // %bb.0:
+; CHECK-NEXT: mov z25.d, z2.d
+; CHECK-NEXT: mov z24.d, z1.d
+; CHECK-NEXT: bfmop4a za1.s, z0.h, { z24.h, z25.h }
+; CHECK-NEXT: ret
+ call void @llvm.aarch64.sme.mop4a.wide.1x2.nxv8bf16(i32 1, <vscale x 8 x bfloat> %zn, <vscale x 8 x bfloat> %zm1, <vscale x 8 x bfloat> %zm2)
+ ret void
+}
+
+define void @mop4s_za32_bf16(<vscale x 8 x bfloat> %zn, <vscale x 8 x bfloat> %zm1, <vscale x 8 x bfloat> %zm2) #0 {
+; CHECK-LABEL: mop4s_za32_bf16:
+; CHECK: // %bb.0:
+; CHECK-NEXT: mov z25.d, z2.d
+; CHECK-NEXT: mov z24.d, z1.d
+; CHECK-NEXT: bfmop4s za1.s, z0.h, { z24.h, z25.h }
+; CHECK-NEXT: ret
+ call void @llvm.aarch64.sme.mop4s.wide.1x2.nxv8bf16(i32 1, <vscale x 8 x bfloat> %zn, <vscale x 8 x bfloat> %zm1, <vscale x 8 x bfloat> %zm2)
+ ret void
+}
+
+define void @mop4a_za64_s16(<vscale x 8 x i16> %zn, <vscale x 8 x i16> %zm1, <vscale x 8 x i16> %zm2) #0 {
+; CHECK-LABEL: mop4a_za64_s16:
+; CHECK: // %bb.0:
+; CHECK-NEXT: mov z25.d, z2.d
+; CHECK-NEXT: mov z24.d, z1.d
+; CHECK-NEXT: smop4a za1.d, z0.h, { z24.h, z25.h }
+; CHECK-NEXT: ret
+ call void @llvm.aarch64.sme.smop4a.za64.wide.1x2.nxv8i16(i32 1, <vscale x 8 x i16> %zn, <vscale x 8 x i16> %zm1, <vscale x 8 x i16> %zm2)
+ ret void
+}
+
+define void @mop4s_za64_s16(<vscale x 8 x i16> %zn, <vscale x 8 x i16> %zm1, <vscale x 8 x i16> %zm2) #0 {
+; CHECK-LABEL: mop4s_za64_s16:
+; CHECK: // %bb.0:
+; CHECK-NEXT: mov z25.d, z2.d
+; CHECK-NEXT: mov z24.d, z1.d
+; CHECK-NEXT: smop4s za1.d, z0.h, { z24.h, z25.h }
+; CHECK-NEXT: ret
+ call void @llvm.aarch64.sme.smop4s.za64.wide.1x2.nxv8i16(i32 1, <vscale x 8 x i16> %zn, <vscale x 8 x i16> %zm1, <vscale x 8 x i16> %zm2)
+ ret void
+}
+
+define void @mop4a_za64_u16(<vscale x 8 x i16> %zn, <vscale x 8 x i16> %zm1, <vscale x 8 x i16> %zm2) #0 {
+; CHECK-LABEL: mop4a_za64_u16:
+; CHECK: // %bb.0:
+; CHECK-NEXT: mov z25.d, z2.d
+; CHECK-NEXT: mov z24.d, z1.d
+; CHECK-NEXT: umop4a za1.d, z0.h, { z24.h, z25.h }
+; CHECK-NEXT: ret
+ call void @llvm.aarch64.sme.umop4a.za64.wide.1x2.nxv8i16(i32 1, <vscale x 8 x i16> %zn, <vscale x 8 x i16> %zm1, <vscale x 8 x i16> %zm2)
+ ret void
+}
+
+define void @mop4s_za64_u16(<vscale x 8 x i16> %zn, <vscale x 8 x i16> %zm1, <vscale x 8 x i16> %zm2) #0 {
+; CHECK-LABEL: mop4s_za64_u16:
+; CHECK: // %bb.0:
+; CHECK-NEXT: mov z25.d, z2.d
+; CHECK-NEXT: mov z24.d, z1.d
+; CHECK-NEXT: umop4s za1.d, z0.h, { z24.h, z25.h }
+; CHECK-NEXT: ret
+ call void @llvm.aarch64.sme.umop4s.za64.wide.1x2.nxv8i16(i32 1, <vscale x 8 x i16> %zn, <vscale x 8 x i16> %zm1, <vscale x 8 x i16> %zm2)
+ ret void
+}
+
+define void @mop4a_za64_s16_u16(<vscale x 8 x i16> %zn, <vscale x 8 x i16> %zm1, <vscale x 8 x i16> %zm2) #0 {
+; CHECK-LABEL: mop4a_za64_s16_u16:
+; CHECK: // %bb.0:
+; CHECK-NEXT: mov z25.d, z2.d
+; CHECK-NEXT: mov z24.d, z1.d
+; CHECK-NEXT: sumop4a za1.d, z0.h, { z24.h, z25.h }
+; CHECK-NEXT: ret
+ call void @llvm.aarch64.sme.sumop4a.za64.wide.1x2.nxv8i16(i32 1, <vscale x 8 x i16> %zn, <vscale x 8 x i16> %zm1, <vscale x 8 x i16> %zm2)
+ ret void
+}
+
+define void @mop4s_za64_s16_u16(<vscale x 8 x i16> %zn, <vscale x 8 x i16> %zm1, <vscale x 8 x i16> %zm2) #0 {
+; CHECK-LABEL: mop4s_za64_s16_u16:
+; CHECK: // %bb.0:
+; CHECK-NEXT: mov z25.d, z2.d
+; CHECK-NEXT: mov z24.d, z1.d
+; CHECK-NEXT: sumop4s za1.d, z0.h, { z24.h, z25.h }
+; CHECK-NEXT: ret
+ call void @llvm.aarch64.sme.sumop4s.za64.wide.1x2.nxv8i16(i32 1, <vscale x 8 x i16> %zn, <vscale x 8 x i16> %zm1, <vscale x 8 x i16> %zm2)
+ ret void
+}
+
+define void @mop4a_za64_u16_s16(<vscale x 8 x i16> %zn, <vscale x 8 x i16> %zm1, <vscale x 8 x i16> %zm2) #0 {
+; CHECK-LABEL: mop4a_za64_u16_s16:
+; CHECK: // %bb.0:
+; CHECK-NEXT: mov z25.d, z2.d
+; CHECK-NEXT: mov z24.d, z1.d
+; CHECK-NEXT: usmop4a za1.d, z0.h, { z24.h, z25.h }
+; CHECK-NEXT: ret
+ call void @llvm.aarch64.sme.usmop4a.za64.wide.1x2.nxv8i16(i32 1, <vscale x 8 x i16> %zn, <vscale x 8 x i16> %zm1, <vscale x 8 x i16> %zm2)
+ ret void
+}
+
+define void @mop4s_za64_u16_s16(<vscale x 8 x i16> %zn, <vscale x 8 x i16> %zm1, <vscale x 8 x i16> %zm2) #0 {
+; CHECK-LABEL: mop4s_za64_u16_s16:
+; CHECK: // %bb.0:
+; CHECK-NEXT: mov z25.d, z2.d
+; CHECK-NEXT: mov z24.d, z1.d
+; CHECK-NEXT: usmop4s za1.d, z0.h, { z24.h, z25.h }
+; CHECK-NEXT: ret
+ call void @llvm.aarch64.sme.usmop4s.za64.wide.1x2.nxv8i16(i32 1, <vscale x 8 x i16> %zn, <vscale x 8 x i16> %zm1, <vscale x 8 x i16> %zm2)
+ ret void
+}
+
+; Non-widening
+define void @mop4a_za16_f16(<vscale x 8 x half> %zn, <vscale x 8 x half> %zm1, <vscale x 8 x half> %zm2) #0 {
+; CHECK-LABEL: mop4a_za16_f16:
+; CHECK: // %bb.0:
+; CHECK-NEXT: mov z25.d, z2.d
+; CHECK-NEXT: mov z24.d, z1.d
+; CHECK-NEXT: fmop4a za1.h, z0.h, { z24.h, z25.h }
+; CHECK-NEXT: ret
+ call void @llvm.aarch64.sme.mop4a.1x2.nxv8f16(i32 1, <vscale x 8 x half> %zn, <vscale x 8 x half> %zm1, <vscale x 8 x half> %zm2)
+ ret void
+}
+
+define void @mop4s_za16_f16(<vscale x 8 x half> %zn, <vscale x 8 x half> %zm1, <vscale x 8 x half> %zm2) #0 {
+; CHECK-LABEL: mop4s_za16_f16:
+; CHECK: // %bb.0:
+; CHECK-NEXT: mov z25.d, z2.d
+; CHECK-NEXT: mov z24.d, z1.d
+; CHECK-NEXT: fmop4s za1.h, z0.h, { z24.h, z25.h }
+; CHECK-NEXT: ret
+ call void @llvm.aarch64.sme.mop4s.1x2.nxv8f16(i32 1, <vscale x 8 x half> %zn, <vscale x 8 x half> %zm1, <vscale x 8 x half> %zm2)
+ ret void
+}
+
+define void @mop4a_za32_f32(<vscale x 4 x float> %zn, <vscale x 4 x float> %zm1, <vscale x 4 x float> %zm2) #0 {
+; CHECK-LABEL: mop4a_za32_f32:
+; CHECK: // %bb.0:
+; CHECK-NEXT: mov z25.d, z2.d
+; CHECK-NEXT: mov z24.d, z1.d
+; CHECK-NEXT: fmop4a za1.s, z0.s, { z24.s, z25.s }
+; CHECK-NEXT: ret
+ call void @llvm.aarch64.sme.mop4a.1x2.nxv4f32(i32 1, <vscale x 4 x float> %zn, <vscale x 4 x float> %zm1, <vscale x 4 x float> %zm2)
+ ret void
+}
+
+define void @mop4s_za32_f32(<vscale x 4 x float> %zn, <vscale x 4 x float> %zm1, <vscale x 4 x float> %zm2) #0 {
+; CHECK-LABEL: mop4s_za32_f32:
+; CHECK: // %bb.0:
+; CHECK-NEXT: mov z25.d, z2.d
+; CHECK-NEXT: mov z24.d, z1.d
+; CHECK-NEXT: fmop4s za1.s, z0.s, { z24.s, z25.s }
+; CHECK-NEXT: ret
+ call void @llvm.aarch64.sme.mop4s.1x2.nxv4f32(i32 1, <vscale x 4 x float> %zn, <vscale x 4 x float> %zm1, <vscale x 4 x float> %zm2)
+ ret void
+}
+
+define void @mop4a_za64_f64(<vscale x 2 x double> %zn, <vscale x 2 x double> %zm1, <vscale x 2 x double> %zm2) #0 {
+; CHECK-LABEL: mop4a_za64_f64:
+; CHECK: // %bb.0:
+; CHECK-NEXT: mov z25.d, z2.d
+; CHECK-NEXT: mov z24.d, z1.d
+; CHECK-NEXT: fmop4a za1.d, z0.d, { z24.d, z25.d }
+; CHECK-NEXT: ret
+ call void @llvm.aarch64.sme.mop4a.1x2.nxv2f64(i32 1, <vscale x 2 x double> %zn, <vscale x 2 x double> %zm1, <vscale x 2 x double> %zm2)
+ ret void
+}
+
+define void @mop4s_za64_f64(<vscale x 2 x double> %zn, <vscale x 2 x double> %zm1, <vscale x 2 x double> %zm2) #0 {
+; CHECK-LABEL: mop4s_za64_f64:
+; CHECK: // %bb.0:
+; CHECK-NEXT: mov z25.d, z2.d
+; CHECK-NEXT: mov z24.d, z1.d
+; CHECK-NEXT: fmop4s za1.d, z0.d, { z24.d, z25.d }
+; CHECK-NEXT: ret
+ call void @llvm.aarch64.sme.mop4s.1x2.nxv2f64(i32 1, <vscale x 2 x double> %zn, <vscale x 2 x double> %zm1, <vscale x 2 x double> %zm2)
+ ret void
+}
+
+define void @mop4a_za16_bf16(<vscale x 8 x bfloat> %zn, <vscale x 8 x bfloat> %zm1, <vscale x 8 x bfloat> %zm2) #0 {
+; CHECK-LABEL: mop4a_za16_bf16:
+; CHECK: // %bb.0:
+; CHECK-NEXT: mov z25.d, z2.d
+; CHECK-NEXT: mov z24.d, z1.d
+; CHECK-NEXT: bfmop4a za1.h, z0.h, { z24.h, z25.h }
+; CHECK-NEXT: ret
+ call void @llvm.aarch64.sme.mop4a.1x2.nxv8bf16(i32 1, <vscale x 8 x bfloat> %zn, <vscale x 8 x bfloat> %zm1, <vscale x 8 x bfloat> %zm2)
+ ret void
+}
+
+define void @mop4s_za16_bf16(<vscale x 8 x bfloat> %zn, <vscale x 8 x bfloat> %zm1, <vscale x 8 x bfloat> %zm2) #0 {
+; CHECK-LABEL: mop4s_za16_bf16:
+; CHECK: // %bb.0:
+; CHECK-NEXT: mov z25.d, z2.d
+; CHECK-NEXT: mov z24.d, z1.d
+; CHECK-NEXT: bfmop4s za1.h, z0.h, { z24.h, z25.h }
+; CHECK-NEXT: ret
+ call void @llvm.aarch64.sme.mop4s.1x2.nxv8bf16(i32 1, <vscale x 8 x bfloat> %zn, <vscale x 8 x bfloat> %zm1, <vscale x 8 x bfloat> %zm2)
+ ret void
+}
+
+attributes #0 = { nounwind "target-features"="+sme-i16i64,+sme-f64f64,+sme-b16b16,+sme2p1,+bf16,+sme-f16f16,+sme-mop4" }
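
For context, a minimal C-level usage sketch of the ACLE intrinsic these IR tests lower from. The function name `example` is hypothetical; the intrinsic name, attributes, and immediate range mirror the clang tests in this PR:

#include <arm_sme.h>

// Sketch only: the first argument selects the ZA tile and must be a
// constant immediate in range for za32 (0-3).
void example(svint8_t zn, svint8x2_t zm) __arm_streaming __arm_inout("za") {
  svmop4a_1x2_za32_s8_s8(1, zn, zm);
}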
>From ebd7008a62acab8a1dd41b4369cfcdcd64f22a93 Mon Sep 17 00:00:00 2001
From: Virginia Cangelosi <virginia.cangelosi at arm.com>
Date: Wed, 26 Feb 2025 10:47:50 +0000
Subject: [PATCH 09/11] Fix clang-format issues
---
clang/utils/TableGen/SveEmitter.cpp | 3 ++-
1 file changed, 2 insertions(+), 1 deletion(-)
diff --git a/clang/utils/TableGen/SveEmitter.cpp b/clang/utils/TableGen/SveEmitter.cpp
index 63452d26654be..200f57960fff8 100644
--- a/clang/utils/TableGen/SveEmitter.cpp
+++ b/clang/utils/TableGen/SveEmitter.cpp
@@ -1043,7 +1043,8 @@ std::string Intrinsic::replaceTemplatedArgs(std::string Name, TypeSpec TS,
case '1':
case '2':
case '3':
- // Extract the modifier before passing to SVEType to handle numeric modifiers
+ // Extract the modifier before passing to SVEType to handle numeric
+ // modifiers
auto [Mod, NumVectors] = getProtoModifier(Proto, (C - '0'));
T = SVEType(TS, Mod);
break;
>From d3367db0b1f9aed3677279948544050530116797 Mon Sep 17 00:00:00 2001
From: Virginia Cangelosi <virginia.cangelosi at arm.com>
Date: Wed, 26 Feb 2025 15:37:15 +0000
Subject: [PATCH 10/11] Fix immediates and tests for 1x2 mop4
---
clang/include/clang/Basic/arm_sme.td | 9 +
.../sme2-intrinsics/acle_sme2_mop4_1x2.c | 192 +++++++--------
llvm/lib/Target/AArch64/SMEInstrFormats.td | 8 +-
.../AArch64/sme2-intrinsics-mop4a_1x2.ll | 225 +++++++++++++-----
4 files changed, 272 insertions(+), 162 deletions(-)
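
The immediate ranges corrected in this patch track the number of ZA tiles available for each element size; as a quick reference (a summary in C-comment form, not part of the patch itself):

// ZA tile counts and tile-select immediate ranges:
//   za16 (.h tiles): 2 tiles -> imm 0-1 (timm32_0_1)
//   za32 (.s tiles): 4 tiles -> imm 0-3 (timm32_0_3)
//   za64 (.d tiles): 8 tiles -> imm 0-7 (timm32_0_7)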
diff --git a/clang/include/clang/Basic/arm_sme.td b/clang/include/clang/Basic/arm_sme.td
index ff42f110f72cb..b8fbfae3060d3 100644
--- a/clang/include/clang/Basic/arm_sme.td
+++ b/clang/include/clang/Basic/arm_sme.td
@@ -378,6 +378,7 @@ let SMETargetGuard = "sme2" in {
multiclass MOP4<string name, string n, string t, string i, string wide, list<ImmCheck> checks> {
def NAME # "_1x1" : Inst<"svmop4" # name # "_1x1_" # n # "[_{d}_{d}]", "vidd", t, MergeNone, i # wide # "_1x1", [IsInOutZA, IsStreaming], checks>;
+ def NAME # "_1x2" : Inst<"svmop4" # name # "_1x2_" # n # "[_{d}_{d}]", "vid2", t, MergeNone, i # wide # "_1x2", [IsInOutZA, IsStreaming], checks>;
}
multiclass SUMOP4<string s, string za, string t, string i, list<ImmCheck> checks> {
@@ -385,6 +386,10 @@ multiclass SUMOP4<string s, string za, string t, string i, list<ImmCheck> checks
"vidu", t, MergeNone, "aarch64_sme_sumop4" # s # i # "_wide_1x1",
[IsStreaming, IsInOutZA],
checks>;
+ def _1x2 : SInst<"svmop4" # s # "[_1x2_]" # za # "[_{d}_{3}]",
+ "vid2.u", t, MergeNone, "aarch64_sme_sumop4" # s # i # "_wide_1x2",
+ [IsStreaming, IsInOutZA],
+ checks>;
}
multiclass USMOP4<string s, string za, string t, string i, list<ImmCheck> checks> {
@@ -392,6 +397,10 @@ multiclass USMOP4<string s, string za, string t, string i, list<ImmCheck> checks
"vidx", t, MergeNone, "aarch64_sme_usmop4" # s # i # "_wide_1x1",
[IsStreaming, IsInOutZA],
checks>;
+ def _1x2 : SInst<"svmop4" # s # "[_1x2_]" # za # "[_{d}_{3}]",
+ "vid2.x", t, MergeNone, "aarch64_sme_usmop4" # s # i # "_wide_1x2",
+ [IsStreaming, IsInOutZA],
+ checks>;
}
let SMETargetGuard = "sme2" in {
diff --git a/clang/test/CodeGen/AArch64/sme2-intrinsics/acle_sme2_mop4_1x2.c b/clang/test/CodeGen/AArch64/sme2-intrinsics/acle_sme2_mop4_1x2.c
index 624fcd9281d83..17e768a1b61f8 100644
--- a/clang/test/CodeGen/AArch64/sme2-intrinsics/acle_sme2_mop4_1x2.c
+++ b/clang/test/CodeGen/AArch64/sme2-intrinsics/acle_sme2_mop4_1x2.c
@@ -18,449 +18,449 @@
// CHECK-LABEL: @test_svmop4a_1x2_za32_s8_s8(
// CHECK-NEXT: entry:
-// CHECK-NEXT: tail call void @llvm.aarch64.sme.smop4a.wide.1x2.nxv16i8(i32 3, <vscale x 16 x i8> [[ZN:%.*]], <vscale x 16 x i8> [[ZM_COERCE0:%.*]], <vscale x 16 x i8> [[ZM_COERCE1:%.*]])
+// CHECK-NEXT: tail call void @llvm.aarch64.sme.smop4a.wide.1x2.nxv16i8(i32 1, <vscale x 16 x i8> [[ZN:%.*]], <vscale x 16 x i8> [[ZM_COERCE0:%.*]], <vscale x 16 x i8> [[ZM_COERCE1:%.*]])
// CHECK-NEXT: ret void
//
// CPP-CHECK-LABEL: @_Z27test_svmop4a_1x2_za32_s8_s8u10__SVInt8_t10svint8x2_t(
// CPP-CHECK-NEXT: entry:
-// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.smop4a.wide.1x2.nxv16i8(i32 3, <vscale x 16 x i8> [[ZN:%.*]], <vscale x 16 x i8> [[ZM_COERCE0:%.*]], <vscale x 16 x i8> [[ZM_COERCE1:%.*]])
+// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.smop4a.wide.1x2.nxv16i8(i32 1, <vscale x 16 x i8> [[ZN:%.*]], <vscale x 16 x i8> [[ZM_COERCE0:%.*]], <vscale x 16 x i8> [[ZM_COERCE1:%.*]])
// CPP-CHECK-NEXT: ret void
//
void test_svmop4a_1x2_za32_s8_s8(svint8_t zn, svint8x2_t zm) __arm_streaming __arm_inout("za") {
- SME_ACLE_FUNC(svmop4a_1x2_za32,_s8_s8,)(3, zn, zm);
+ SME_ACLE_FUNC(svmop4a_1x2_za32,_s8_s8,)(1, zn, zm);
}
// CHECK-LABEL: @test_svmop4s_1x2_za32_s8_s8(
// CHECK-NEXT: entry:
-// CHECK-NEXT: tail call void @llvm.aarch64.sme.smop4s.wide.1x2.nxv16i8(i32 3, <vscale x 16 x i8> [[ZN:%.*]], <vscale x 16 x i8> [[ZM_COERCE0:%.*]], <vscale x 16 x i8> [[ZM_COERCE1:%.*]])
+// CHECK-NEXT: tail call void @llvm.aarch64.sme.smop4s.wide.1x2.nxv16i8(i32 1, <vscale x 16 x i8> [[ZN:%.*]], <vscale x 16 x i8> [[ZM_COERCE0:%.*]], <vscale x 16 x i8> [[ZM_COERCE1:%.*]])
// CHECK-NEXT: ret void
//
// CPP-CHECK-LABEL: @_Z27test_svmop4s_1x2_za32_s8_s8u10__SVInt8_t10svint8x2_t(
// CPP-CHECK-NEXT: entry:
-// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.smop4s.wide.1x2.nxv16i8(i32 3, <vscale x 16 x i8> [[ZN:%.*]], <vscale x 16 x i8> [[ZM_COERCE0:%.*]], <vscale x 16 x i8> [[ZM_COERCE1:%.*]])
+// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.smop4s.wide.1x2.nxv16i8(i32 1, <vscale x 16 x i8> [[ZN:%.*]], <vscale x 16 x i8> [[ZM_COERCE0:%.*]], <vscale x 16 x i8> [[ZM_COERCE1:%.*]])
// CPP-CHECK-NEXT: ret void
//
void test_svmop4s_1x2_za32_s8_s8(svint8_t zn, svint8x2_t zm) __arm_streaming __arm_inout("za") {
- SME_ACLE_FUNC(svmop4s_1x2_za32,_s8_s8,)(3, zn, zm);
+ SME_ACLE_FUNC(svmop4s_1x2_za32,_s8_s8,)(1, zn, zm);
}
// CHECK-LABEL: @test_svmop4a_1x2_za32_u8_u8(
// CHECK-NEXT: entry:
-// CHECK-NEXT: tail call void @llvm.aarch64.sme.umop4a.wide.1x2.nxv16i8(i32 3, <vscale x 16 x i8> [[ZN:%.*]], <vscale x 16 x i8> [[ZM_COERCE0:%.*]], <vscale x 16 x i8> [[ZM_COERCE1:%.*]])
+// CHECK-NEXT: tail call void @llvm.aarch64.sme.umop4a.wide.1x2.nxv16i8(i32 1, <vscale x 16 x i8> [[ZN:%.*]], <vscale x 16 x i8> [[ZM_COERCE0:%.*]], <vscale x 16 x i8> [[ZM_COERCE1:%.*]])
// CHECK-NEXT: ret void
//
// CPP-CHECK-LABEL: @_Z27test_svmop4a_1x2_za32_u8_u8u11__SVUint8_t11svuint8x2_t(
// CPP-CHECK-NEXT: entry:
-// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.umop4a.wide.1x2.nxv16i8(i32 3, <vscale x 16 x i8> [[ZN:%.*]], <vscale x 16 x i8> [[ZM_COERCE0:%.*]], <vscale x 16 x i8> [[ZM_COERCE1:%.*]])
+// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.umop4a.wide.1x2.nxv16i8(i32 1, <vscale x 16 x i8> [[ZN:%.*]], <vscale x 16 x i8> [[ZM_COERCE0:%.*]], <vscale x 16 x i8> [[ZM_COERCE1:%.*]])
// CPP-CHECK-NEXT: ret void
//
void test_svmop4a_1x2_za32_u8_u8(svuint8_t zn, svuint8x2_t zm) __arm_streaming __arm_inout("za") {
- SME_ACLE_FUNC(svmop4a_1x2_za32,_u8_u8,)(3, zn, zm);
+ SME_ACLE_FUNC(svmop4a_1x2_za32,_u8_u8,)(1, zn, zm);
}
// CHECK-LABEL: @test_svmop4s_1x2_za32_u8_u8(
// CHECK-NEXT: entry:
-// CHECK-NEXT: tail call void @llvm.aarch64.sme.umop4s.wide.1x2.nxv16i8(i32 3, <vscale x 16 x i8> [[ZN:%.*]], <vscale x 16 x i8> [[ZM_COERCE0:%.*]], <vscale x 16 x i8> [[ZM_COERCE1:%.*]])
+// CHECK-NEXT: tail call void @llvm.aarch64.sme.umop4s.wide.1x2.nxv16i8(i32 1, <vscale x 16 x i8> [[ZN:%.*]], <vscale x 16 x i8> [[ZM_COERCE0:%.*]], <vscale x 16 x i8> [[ZM_COERCE1:%.*]])
// CHECK-NEXT: ret void
//
// CPP-CHECK-LABEL: @_Z27test_svmop4s_1x2_za32_u8_u8u11__SVUint8_t11svuint8x2_t(
// CPP-CHECK-NEXT: entry:
-// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.umop4s.wide.1x2.nxv16i8(i32 3, <vscale x 16 x i8> [[ZN:%.*]], <vscale x 16 x i8> [[ZM_COERCE0:%.*]], <vscale x 16 x i8> [[ZM_COERCE1:%.*]])
+// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.umop4s.wide.1x2.nxv16i8(i32 1, <vscale x 16 x i8> [[ZN:%.*]], <vscale x 16 x i8> [[ZM_COERCE0:%.*]], <vscale x 16 x i8> [[ZM_COERCE1:%.*]])
// CPP-CHECK-NEXT: ret void
//
void test_svmop4s_1x2_za32_u8_u8(svuint8_t zn, svuint8x2_t zm) __arm_streaming __arm_inout("za") {
- SME_ACLE_FUNC(svmop4s_1x2_za32,_u8_u8,)(3, zn, zm);
+ SME_ACLE_FUNC(svmop4s_1x2_za32,_u8_u8,)(1, zn, zm);
}
// CHECK-LABEL: @test_svmop4a_1x2_za32_s8_u8(
// CHECK-NEXT: entry:
-// CHECK-NEXT: tail call void @llvm.aarch64.sme.sumop4a.wide.1x2.nxv16i8(i32 3, <vscale x 16 x i8> [[ZN:%.*]], <vscale x 16 x i8> [[ZM_COERCE0:%.*]], <vscale x 16 x i8> [[ZM_COERCE1:%.*]])
+// CHECK-NEXT: tail call void @llvm.aarch64.sme.sumop4a.wide.1x2.nxv16i8(i32 1, <vscale x 16 x i8> [[ZN:%.*]], <vscale x 16 x i8> [[ZM_COERCE0:%.*]], <vscale x 16 x i8> [[ZM_COERCE1:%.*]])
// CHECK-NEXT: ret void
//
// CPP-CHECK-LABEL: @_Z27test_svmop4a_1x2_za32_s8_u8u10__SVInt8_t11svuint8x2_t(
// CPP-CHECK-NEXT: entry:
-// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.sumop4a.wide.1x2.nxv16i8(i32 3, <vscale x 16 x i8> [[ZN:%.*]], <vscale x 16 x i8> [[ZM_COERCE0:%.*]], <vscale x 16 x i8> [[ZM_COERCE1:%.*]])
+// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.sumop4a.wide.1x2.nxv16i8(i32 1, <vscale x 16 x i8> [[ZN:%.*]], <vscale x 16 x i8> [[ZM_COERCE0:%.*]], <vscale x 16 x i8> [[ZM_COERCE1:%.*]])
// CPP-CHECK-NEXT: ret void
//
void test_svmop4a_1x2_za32_s8_u8(svint8_t zn, svuint8x2_t zm) __arm_streaming __arm_inout("za") {
- SME_ACLE_FUNC(svmop4a_1x2_za32,_s8_u8,)(3, zn, zm);
+ SME_ACLE_FUNC(svmop4a_1x2_za32,_s8_u8,)(1, zn, zm);
}
// CHECK-LABEL: @test_svmop4s_1x2_za32_s8_u8(
// CHECK-NEXT: entry:
-// CHECK-NEXT: tail call void @llvm.aarch64.sme.sumop4s.wide.1x2.nxv16i8(i32 3, <vscale x 16 x i8> [[ZN:%.*]], <vscale x 16 x i8> [[ZM_COERCE0:%.*]], <vscale x 16 x i8> [[ZM_COERCE1:%.*]])
+// CHECK-NEXT: tail call void @llvm.aarch64.sme.sumop4s.wide.1x2.nxv16i8(i32 1, <vscale x 16 x i8> [[ZN:%.*]], <vscale x 16 x i8> [[ZM_COERCE0:%.*]], <vscale x 16 x i8> [[ZM_COERCE1:%.*]])
// CHECK-NEXT: ret void
//
// CPP-CHECK-LABEL: @_Z27test_svmop4s_1x2_za32_s8_u8u10__SVInt8_t11svuint8x2_t(
// CPP-CHECK-NEXT: entry:
-// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.sumop4s.wide.1x2.nxv16i8(i32 3, <vscale x 16 x i8> [[ZN:%.*]], <vscale x 16 x i8> [[ZM_COERCE0:%.*]], <vscale x 16 x i8> [[ZM_COERCE1:%.*]])
+// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.sumop4s.wide.1x2.nxv16i8(i32 1, <vscale x 16 x i8> [[ZN:%.*]], <vscale x 16 x i8> [[ZM_COERCE0:%.*]], <vscale x 16 x i8> [[ZM_COERCE1:%.*]])
// CPP-CHECK-NEXT: ret void
//
void test_svmop4s_1x2_za32_s8_u8(svint8_t zn, svuint8x2_t zm) __arm_streaming __arm_inout("za") {
- SME_ACLE_FUNC(svmop4s_1x2_za32,_s8_u8,)(3, zn, zm);
+ SME_ACLE_FUNC(svmop4s_1x2_za32,_s8_u8,)(1, zn, zm);
}
// CHECK-LABEL: @test_svmop4a_1x2_za32_u8_s8(
// CHECK-NEXT: entry:
-// CHECK-NEXT: tail call void @llvm.aarch64.sme.usmop4a.wide.1x2.nxv16i8(i32 3, <vscale x 16 x i8> [[ZN:%.*]], <vscale x 16 x i8> [[ZM_COERCE0:%.*]], <vscale x 16 x i8> [[ZM_COERCE1:%.*]])
+// CHECK-NEXT: tail call void @llvm.aarch64.sme.usmop4a.wide.1x2.nxv16i8(i32 1, <vscale x 16 x i8> [[ZN:%.*]], <vscale x 16 x i8> [[ZM_COERCE0:%.*]], <vscale x 16 x i8> [[ZM_COERCE1:%.*]])
// CHECK-NEXT: ret void
//
// CPP-CHECK-LABEL: @_Z27test_svmop4a_1x2_za32_u8_s8u11__SVUint8_t10svint8x2_t(
// CPP-CHECK-NEXT: entry:
-// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.usmop4a.wide.1x2.nxv16i8(i32 3, <vscale x 16 x i8> [[ZN:%.*]], <vscale x 16 x i8> [[ZM_COERCE0:%.*]], <vscale x 16 x i8> [[ZM_COERCE1:%.*]])
+// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.usmop4a.wide.1x2.nxv16i8(i32 1, <vscale x 16 x i8> [[ZN:%.*]], <vscale x 16 x i8> [[ZM_COERCE0:%.*]], <vscale x 16 x i8> [[ZM_COERCE1:%.*]])
// CPP-CHECK-NEXT: ret void
//
void test_svmop4a_1x2_za32_u8_s8(svuint8_t zn, svint8x2_t zm) __arm_streaming __arm_inout("za") {
- SME_ACLE_FUNC(svmop4a_1x2_za32,_u8_s8,)(3, zn, zm);
+ SME_ACLE_FUNC(svmop4a_1x2_za32,_u8_s8,)(1, zn, zm);
}
// CHECK-LABEL: @test_svmop4s_1x2_za32_u8_s8(
// CHECK-NEXT: entry:
-// CHECK-NEXT: tail call void @llvm.aarch64.sme.usmop4s.wide.1x2.nxv16i8(i32 3, <vscale x 16 x i8> [[ZN:%.*]], <vscale x 16 x i8> [[ZM_COERCE0:%.*]], <vscale x 16 x i8> [[ZM_COERCE1:%.*]])
+// CHECK-NEXT: tail call void @llvm.aarch64.sme.usmop4s.wide.1x2.nxv16i8(i32 1, <vscale x 16 x i8> [[ZN:%.*]], <vscale x 16 x i8> [[ZM_COERCE0:%.*]], <vscale x 16 x i8> [[ZM_COERCE1:%.*]])
// CHECK-NEXT: ret void
//
// CPP-CHECK-LABEL: @_Z27test_svmop4s_1x2_za32_u8_s8u11__SVUint8_t10svint8x2_t(
// CPP-CHECK-NEXT: entry:
-// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.usmop4s.wide.1x2.nxv16i8(i32 3, <vscale x 16 x i8> [[ZN:%.*]], <vscale x 16 x i8> [[ZM_COERCE0:%.*]], <vscale x 16 x i8> [[ZM_COERCE1:%.*]])
+// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.usmop4s.wide.1x2.nxv16i8(i32 1, <vscale x 16 x i8> [[ZN:%.*]], <vscale x 16 x i8> [[ZM_COERCE0:%.*]], <vscale x 16 x i8> [[ZM_COERCE1:%.*]])
// CPP-CHECK-NEXT: ret void
//
void test_svmop4s_1x2_za32_u8_s8(svuint8_t zn, svint8x2_t zm) __arm_streaming __arm_inout("za") {
- SME_ACLE_FUNC(svmop4s_1x2_za32,_u8_s8,)(3, zn, zm);
+ SME_ACLE_FUNC(svmop4s_1x2_za32,_u8_s8,)(1, zn, zm);
}
// CHECK-LABEL: @test_svmop4a_1x2_za32_s16_s16(
// CHECK-NEXT: entry:
-// CHECK-NEXT: tail call void @llvm.aarch64.sme.smop4a.wide.1x2.nxv8i16(i32 3, <vscale x 8 x i16> [[ZN:%.*]], <vscale x 8 x i16> [[ZM_COERCE0:%.*]], <vscale x 8 x i16> [[ZM_COERCE1:%.*]])
+// CHECK-NEXT: tail call void @llvm.aarch64.sme.smop4a.wide.1x2.nxv8i16(i32 1, <vscale x 8 x i16> [[ZN:%.*]], <vscale x 8 x i16> [[ZM_COERCE0:%.*]], <vscale x 8 x i16> [[ZM_COERCE1:%.*]])
// CHECK-NEXT: ret void
//
// CPP-CHECK-LABEL: @_Z29test_svmop4a_1x2_za32_s16_s16u11__SVInt16_t11svint16x2_t(
// CPP-CHECK-NEXT: entry:
-// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.smop4a.wide.1x2.nxv8i16(i32 3, <vscale x 8 x i16> [[ZN:%.*]], <vscale x 8 x i16> [[ZM_COERCE0:%.*]], <vscale x 8 x i16> [[ZM_COERCE1:%.*]])
+// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.smop4a.wide.1x2.nxv8i16(i32 1, <vscale x 8 x i16> [[ZN:%.*]], <vscale x 8 x i16> [[ZM_COERCE0:%.*]], <vscale x 8 x i16> [[ZM_COERCE1:%.*]])
// CPP-CHECK-NEXT: ret void
//
void test_svmop4a_1x2_za32_s16_s16(svint16_t zn, svint16x2_t zm) __arm_streaming __arm_inout("za") {
- SME_ACLE_FUNC(svmop4a_1x2_za32,_s16_s16,)(3, zn, zm);
+ SME_ACLE_FUNC(svmop4a_1x2_za32,_s16_s16,)(1, zn, zm);
}
// CHECK-LABEL: @test_svmop4s_1x2_za32_s16_s16(
// CHECK-NEXT: entry:
-// CHECK-NEXT: tail call void @llvm.aarch64.sme.smop4s.wide.1x2.nxv8i16(i32 3, <vscale x 8 x i16> [[ZN:%.*]], <vscale x 8 x i16> [[ZM_COERCE0:%.*]], <vscale x 8 x i16> [[ZM_COERCE1:%.*]])
+// CHECK-NEXT: tail call void @llvm.aarch64.sme.smop4s.wide.1x2.nxv8i16(i32 1, <vscale x 8 x i16> [[ZN:%.*]], <vscale x 8 x i16> [[ZM_COERCE0:%.*]], <vscale x 8 x i16> [[ZM_COERCE1:%.*]])
// CHECK-NEXT: ret void
//
// CPP-CHECK-LABEL: @_Z29test_svmop4s_1x2_za32_s16_s16u11__SVInt16_t11svint16x2_t(
// CPP-CHECK-NEXT: entry:
-// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.smop4s.wide.1x2.nxv8i16(i32 3, <vscale x 8 x i16> [[ZN:%.*]], <vscale x 8 x i16> [[ZM_COERCE0:%.*]], <vscale x 8 x i16> [[ZM_COERCE1:%.*]])
+// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.smop4s.wide.1x2.nxv8i16(i32 1, <vscale x 8 x i16> [[ZN:%.*]], <vscale x 8 x i16> [[ZM_COERCE0:%.*]], <vscale x 8 x i16> [[ZM_COERCE1:%.*]])
// CPP-CHECK-NEXT: ret void
//
void test_svmop4s_1x2_za32_s16_s16(svint16_t zn, svint16x2_t zm) __arm_streaming __arm_inout("za") {
- SME_ACLE_FUNC(svmop4s_1x2_za32,_s16_s16,)(3, zn, zm);
+ SME_ACLE_FUNC(svmop4s_1x2_za32,_s16_s16,)(1, zn, zm);
}
// CHECK-LABEL: @test_svmop4a_1x2_za32_u16_u16(
// CHECK-NEXT: entry:
-// CHECK-NEXT: tail call void @llvm.aarch64.sme.umop4a.wide.1x2.nxv8i16(i32 3, <vscale x 8 x i16> [[ZN:%.*]], <vscale x 8 x i16> [[ZM_COERCE0:%.*]], <vscale x 8 x i16> [[ZM_COERCE1:%.*]])
+// CHECK-NEXT: tail call void @llvm.aarch64.sme.umop4a.wide.1x2.nxv8i16(i32 1, <vscale x 8 x i16> [[ZN:%.*]], <vscale x 8 x i16> [[ZM_COERCE0:%.*]], <vscale x 8 x i16> [[ZM_COERCE1:%.*]])
// CHECK-NEXT: ret void
//
// CPP-CHECK-LABEL: @_Z29test_svmop4a_1x2_za32_u16_u16u12__SVUint16_t12svuint16x2_t(
// CPP-CHECK-NEXT: entry:
-// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.umop4a.wide.1x2.nxv8i16(i32 3, <vscale x 8 x i16> [[ZN:%.*]], <vscale x 8 x i16> [[ZM_COERCE0:%.*]], <vscale x 8 x i16> [[ZM_COERCE1:%.*]])
+// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.umop4a.wide.1x2.nxv8i16(i32 1, <vscale x 8 x i16> [[ZN:%.*]], <vscale x 8 x i16> [[ZM_COERCE0:%.*]], <vscale x 8 x i16> [[ZM_COERCE1:%.*]])
// CPP-CHECK-NEXT: ret void
//
void test_svmop4a_1x2_za32_u16_u16(svuint16_t zn, svuint16x2_t zm) __arm_streaming __arm_inout("za") {
- SME_ACLE_FUNC(svmop4a_1x2_za32,_u16_u16,)(3, zn, zm);
+ SME_ACLE_FUNC(svmop4a_1x2_za32,_u16_u16,)(1, zn, zm);
}
// CHECK-LABEL: @test_svmop4s_1x2_za32_u16_u16(
// CHECK-NEXT: entry:
-// CHECK-NEXT: tail call void @llvm.aarch64.sme.umop4s.wide.1x2.nxv8i16(i32 3, <vscale x 8 x i16> [[ZN:%.*]], <vscale x 8 x i16> [[ZM_COERCE0:%.*]], <vscale x 8 x i16> [[ZM_COERCE1:%.*]])
+// CHECK-NEXT: tail call void @llvm.aarch64.sme.umop4s.wide.1x2.nxv8i16(i32 1, <vscale x 8 x i16> [[ZN:%.*]], <vscale x 8 x i16> [[ZM_COERCE0:%.*]], <vscale x 8 x i16> [[ZM_COERCE1:%.*]])
// CHECK-NEXT: ret void
//
// CPP-CHECK-LABEL: @_Z29test_svmop4s_1x2_za32_u16_u16u12__SVUint16_t12svuint16x2_t(
// CPP-CHECK-NEXT: entry:
-// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.umop4s.wide.1x2.nxv8i16(i32 3, <vscale x 8 x i16> [[ZN:%.*]], <vscale x 8 x i16> [[ZM_COERCE0:%.*]], <vscale x 8 x i16> [[ZM_COERCE1:%.*]])
+// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.umop4s.wide.1x2.nxv8i16(i32 1, <vscale x 8 x i16> [[ZN:%.*]], <vscale x 8 x i16> [[ZM_COERCE0:%.*]], <vscale x 8 x i16> [[ZM_COERCE1:%.*]])
// CPP-CHECK-NEXT: ret void
//
void test_svmop4s_1x2_za32_u16_u16(svuint16_t zn, svuint16x2_t zm) __arm_streaming __arm_inout("za") {
- SME_ACLE_FUNC(svmop4s_1x2_za32,_u16_u16,)(3, zn, zm);
+ SME_ACLE_FUNC(svmop4s_1x2_za32,_u16_u16,)(1, zn, zm);
}
// CHECK-LABEL: @test_svmop4a_1x2_za32_f16_f16(
// CHECK-NEXT: entry:
-// CHECK-NEXT: tail call void @llvm.aarch64.sme.mop4a.wide.1x2.nxv8f16(i32 3, <vscale x 8 x half> [[ZN:%.*]], <vscale x 8 x half> [[ZM_COERCE0:%.*]], <vscale x 8 x half> [[ZM_COERCE1:%.*]])
+// CHECK-NEXT: tail call void @llvm.aarch64.sme.mop4a.wide.1x2.nxv8f16(i32 1, <vscale x 8 x half> [[ZN:%.*]], <vscale x 8 x half> [[ZM_COERCE0:%.*]], <vscale x 8 x half> [[ZM_COERCE1:%.*]])
// CHECK-NEXT: ret void
//
// CPP-CHECK-LABEL: @_Z29test_svmop4a_1x2_za32_f16_f16u13__SVFloat16_t13svfloat16x2_t(
// CPP-CHECK-NEXT: entry:
-// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.mop4a.wide.1x2.nxv8f16(i32 3, <vscale x 8 x half> [[ZN:%.*]], <vscale x 8 x half> [[ZM_COERCE0:%.*]], <vscale x 8 x half> [[ZM_COERCE1:%.*]])
+// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.mop4a.wide.1x2.nxv8f16(i32 1, <vscale x 8 x half> [[ZN:%.*]], <vscale x 8 x half> [[ZM_COERCE0:%.*]], <vscale x 8 x half> [[ZM_COERCE1:%.*]])
// CPP-CHECK-NEXT: ret void
//
void test_svmop4a_1x2_za32_f16_f16(svfloat16_t zn, svfloat16x2_t zm) __arm_streaming __arm_inout("za") {
- SME_ACLE_FUNC(svmop4a_1x2_za32,_f16_f16,)(3, zn, zm);
+ SME_ACLE_FUNC(svmop4a_1x2_za32,_f16_f16,)(1, zn, zm);
}
// CHECK-LABEL: @test_svmop4s_1x2_za32_f16_f16(
// CHECK-NEXT: entry:
-// CHECK-NEXT: tail call void @llvm.aarch64.sme.mop4s.wide.1x2.nxv8f16(i32 3, <vscale x 8 x half> [[ZN:%.*]], <vscale x 8 x half> [[ZM_COERCE0:%.*]], <vscale x 8 x half> [[ZM_COERCE1:%.*]])
+// CHECK-NEXT: tail call void @llvm.aarch64.sme.mop4s.wide.1x2.nxv8f16(i32 1, <vscale x 8 x half> [[ZN:%.*]], <vscale x 8 x half> [[ZM_COERCE0:%.*]], <vscale x 8 x half> [[ZM_COERCE1:%.*]])
// CHECK-NEXT: ret void
//
// CPP-CHECK-LABEL: @_Z29test_svmop4s_1x2_za32_f16_f16u13__SVFloat16_t13svfloat16x2_t(
// CPP-CHECK-NEXT: entry:
-// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.mop4s.wide.1x2.nxv8f16(i32 3, <vscale x 8 x half> [[ZN:%.*]], <vscale x 8 x half> [[ZM_COERCE0:%.*]], <vscale x 8 x half> [[ZM_COERCE1:%.*]])
+// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.mop4s.wide.1x2.nxv8f16(i32 1, <vscale x 8 x half> [[ZN:%.*]], <vscale x 8 x half> [[ZM_COERCE0:%.*]], <vscale x 8 x half> [[ZM_COERCE1:%.*]])
// CPP-CHECK-NEXT: ret void
//
void test_svmop4s_1x2_za32_f16_f16(svfloat16_t zn, svfloat16x2_t zm) __arm_streaming __arm_inout("za") {
- SME_ACLE_FUNC(svmop4s_1x2_za32,_f16_f16,)(3, zn, zm);
+ SME_ACLE_FUNC(svmop4s_1x2_za32,_f16_f16,)(1, zn, zm);
}
// CHECK-LABEL: @test_svmop4a_1x2_za32_bf16_bf16(
// CHECK-NEXT: entry:
-// CHECK-NEXT: tail call void @llvm.aarch64.sme.mop4a.wide.1x2.nxv8bf16(i32 3, <vscale x 8 x bfloat> [[ZN:%.*]], <vscale x 8 x bfloat> [[ZM_COERCE0:%.*]], <vscale x 8 x bfloat> [[ZM_COERCE1:%.*]])
+// CHECK-NEXT: tail call void @llvm.aarch64.sme.mop4a.wide.1x2.nxv8bf16(i32 1, <vscale x 8 x bfloat> [[ZN:%.*]], <vscale x 8 x bfloat> [[ZM_COERCE0:%.*]], <vscale x 8 x bfloat> [[ZM_COERCE1:%.*]])
// CHECK-NEXT: ret void
//
// CPP-CHECK-LABEL: @_Z31test_svmop4a_1x2_za32_bf16_bf16u14__SVBfloat16_t14svbfloat16x2_t(
// CPP-CHECK-NEXT: entry:
-// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.mop4a.wide.1x2.nxv8bf16(i32 3, <vscale x 8 x bfloat> [[ZN:%.*]], <vscale x 8 x bfloat> [[ZM_COERCE0:%.*]], <vscale x 8 x bfloat> [[ZM_COERCE1:%.*]])
+// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.mop4a.wide.1x2.nxv8bf16(i32 1, <vscale x 8 x bfloat> [[ZN:%.*]], <vscale x 8 x bfloat> [[ZM_COERCE0:%.*]], <vscale x 8 x bfloat> [[ZM_COERCE1:%.*]])
// CPP-CHECK-NEXT: ret void
//
void test_svmop4a_1x2_za32_bf16_bf16(svbfloat16_t zn, svbfloat16x2_t zm) __arm_streaming __arm_inout("za") {
- SME_ACLE_FUNC(svmop4a_1x2_za32,_bf16_bf16,)(3, zn, zm);
+ SME_ACLE_FUNC(svmop4a_1x2_za32,_bf16_bf16,)(1, zn, zm);
}
// CHECK-LABEL: @test_svmop4s_1x2_za32_bf16_bf16(
// CHECK-NEXT: entry:
-// CHECK-NEXT: tail call void @llvm.aarch64.sme.mop4s.wide.1x2.nxv8bf16(i32 3, <vscale x 8 x bfloat> [[ZN:%.*]], <vscale x 8 x bfloat> [[ZM_COERCE0:%.*]], <vscale x 8 x bfloat> [[ZM_COERCE1:%.*]])
+// CHECK-NEXT: tail call void @llvm.aarch64.sme.mop4s.wide.1x2.nxv8bf16(i32 1, <vscale x 8 x bfloat> [[ZN:%.*]], <vscale x 8 x bfloat> [[ZM_COERCE0:%.*]], <vscale x 8 x bfloat> [[ZM_COERCE1:%.*]])
// CHECK-NEXT: ret void
//
// CPP-CHECK-LABEL: @_Z31test_svmop4s_1x2_za32_bf16_bf16u14__SVBfloat16_t14svbfloat16x2_t(
// CPP-CHECK-NEXT: entry:
-// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.mop4s.wide.1x2.nxv8bf16(i32 3, <vscale x 8 x bfloat> [[ZN:%.*]], <vscale x 8 x bfloat> [[ZM_COERCE0:%.*]], <vscale x 8 x bfloat> [[ZM_COERCE1:%.*]])
+// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.mop4s.wide.1x2.nxv8bf16(i32 1, <vscale x 8 x bfloat> [[ZN:%.*]], <vscale x 8 x bfloat> [[ZM_COERCE0:%.*]], <vscale x 8 x bfloat> [[ZM_COERCE1:%.*]])
// CPP-CHECK-NEXT: ret void
//
void test_svmop4s_1x2_za32_bf16_bf16(svbfloat16_t zn, svbfloat16x2_t zm) __arm_streaming __arm_inout("za") {
- SME_ACLE_FUNC(svmop4s_1x2_za32,_bf16_bf16,)(3, zn, zm);
+ SME_ACLE_FUNC(svmop4s_1x2_za32,_bf16_bf16,)(1, zn, zm);
}
// CHECK-LABEL: @test_svmop4a_1x2_za64_s16_s16(
// CHECK-NEXT: entry:
-// CHECK-NEXT: tail call void @llvm.aarch64.sme.smop4a.za64.wide.1x2.nxv8i16(i32 3, <vscale x 8 x i16> [[ZN:%.*]], <vscale x 8 x i16> [[ZM_COERCE0:%.*]], <vscale x 8 x i16> [[ZM_COERCE1:%.*]])
+// CHECK-NEXT: tail call void @llvm.aarch64.sme.smop4a.za64.wide.1x2.nxv8i16(i32 1, <vscale x 8 x i16> [[ZN:%.*]], <vscale x 8 x i16> [[ZM_COERCE0:%.*]], <vscale x 8 x i16> [[ZM_COERCE1:%.*]])
// CHECK-NEXT: ret void
//
// CPP-CHECK-LABEL: @_Z29test_svmop4a_1x2_za64_s16_s16u11__SVInt16_t11svint16x2_t(
// CPP-CHECK-NEXT: entry:
-// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.smop4a.za64.wide.1x2.nxv8i16(i32 3, <vscale x 8 x i16> [[ZN:%.*]], <vscale x 8 x i16> [[ZM_COERCE0:%.*]], <vscale x 8 x i16> [[ZM_COERCE1:%.*]])
+// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.smop4a.za64.wide.1x2.nxv8i16(i32 1, <vscale x 8 x i16> [[ZN:%.*]], <vscale x 8 x i16> [[ZM_COERCE0:%.*]], <vscale x 8 x i16> [[ZM_COERCE1:%.*]])
// CPP-CHECK-NEXT: ret void
//
void test_svmop4a_1x2_za64_s16_s16(svint16_t zn, svint16x2_t zm) __arm_streaming __arm_inout("za") {
- SME_ACLE_FUNC(svmop4a_1x2_za64,_s16_s16,)(3, zn, zm);
+ SME_ACLE_FUNC(svmop4a_1x2_za64,_s16_s16,)(1, zn, zm);
}
// CHECK-LABEL: @test_svmop4s_1x2_za64_s16_s16(
// CHECK-NEXT: entry:
-// CHECK-NEXT: tail call void @llvm.aarch64.sme.smop4s.za64.wide.1x2.nxv8i16(i32 3, <vscale x 8 x i16> [[ZN:%.*]], <vscale x 8 x i16> [[ZM_COERCE0:%.*]], <vscale x 8 x i16> [[ZM_COERCE1:%.*]])
+// CHECK-NEXT: tail call void @llvm.aarch64.sme.smop4s.za64.wide.1x2.nxv8i16(i32 1, <vscale x 8 x i16> [[ZN:%.*]], <vscale x 8 x i16> [[ZM_COERCE0:%.*]], <vscale x 8 x i16> [[ZM_COERCE1:%.*]])
// CHECK-NEXT: ret void
//
// CPP-CHECK-LABEL: @_Z29test_svmop4s_1x2_za64_s16_s16u11__SVInt16_t11svint16x2_t(
// CPP-CHECK-NEXT: entry:
-// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.smop4s.za64.wide.1x2.nxv8i16(i32 3, <vscale x 8 x i16> [[ZN:%.*]], <vscale x 8 x i16> [[ZM_COERCE0:%.*]], <vscale x 8 x i16> [[ZM_COERCE1:%.*]])
+// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.smop4s.za64.wide.1x2.nxv8i16(i32 1, <vscale x 8 x i16> [[ZN:%.*]], <vscale x 8 x i16> [[ZM_COERCE0:%.*]], <vscale x 8 x i16> [[ZM_COERCE1:%.*]])
// CPP-CHECK-NEXT: ret void
//
void test_svmop4s_1x2_za64_s16_s16(svint16_t zn, svint16x2_t zm) __arm_streaming __arm_inout("za") {
- SME_ACLE_FUNC(svmop4s_1x2_za64,_s16_s16,)(3, zn, zm);
+ SME_ACLE_FUNC(svmop4s_1x2_za64,_s16_s16,)(1, zn, zm);
}
// CHECK-LABEL: @test_svmop4a_1x2_za64_u16_u16(
// CHECK-NEXT: entry:
-// CHECK-NEXT: tail call void @llvm.aarch64.sme.umop4a.za64.wide.1x2.nxv8i16(i32 3, <vscale x 8 x i16> [[ZN:%.*]], <vscale x 8 x i16> [[ZM_COERCE0:%.*]], <vscale x 8 x i16> [[ZM_COERCE1:%.*]])
+// CHECK-NEXT: tail call void @llvm.aarch64.sme.umop4a.za64.wide.1x2.nxv8i16(i32 1, <vscale x 8 x i16> [[ZN:%.*]], <vscale x 8 x i16> [[ZM_COERCE0:%.*]], <vscale x 8 x i16> [[ZM_COERCE1:%.*]])
// CHECK-NEXT: ret void
//
// CPP-CHECK-LABEL: @_Z29test_svmop4a_1x2_za64_u16_u16u12__SVUint16_t12svuint16x2_t(
// CPP-CHECK-NEXT: entry:
-// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.umop4a.za64.wide.1x2.nxv8i16(i32 3, <vscale x 8 x i16> [[ZN:%.*]], <vscale x 8 x i16> [[ZM_COERCE0:%.*]], <vscale x 8 x i16> [[ZM_COERCE1:%.*]])
+// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.umop4a.za64.wide.1x2.nxv8i16(i32 1, <vscale x 8 x i16> [[ZN:%.*]], <vscale x 8 x i16> [[ZM_COERCE0:%.*]], <vscale x 8 x i16> [[ZM_COERCE1:%.*]])
// CPP-CHECK-NEXT: ret void
//
void test_svmop4a_1x2_za64_u16_u16(svuint16_t zn, svuint16x2_t zm) __arm_streaming __arm_inout("za") {
- SME_ACLE_FUNC(svmop4a_1x2_za64,_u16_u16,)(3, zn, zm);
+ SME_ACLE_FUNC(svmop4a_1x2_za64,_u16_u16,)(1, zn, zm);
}
// CHECK-LABEL: @test_svmop4s_1x2_za64_u16_u16(
// CHECK-NEXT: entry:
-// CHECK-NEXT: tail call void @llvm.aarch64.sme.umop4s.za64.wide.1x2.nxv8i16(i32 3, <vscale x 8 x i16> [[ZN:%.*]], <vscale x 8 x i16> [[ZM_COERCE0:%.*]], <vscale x 8 x i16> [[ZM_COERCE1:%.*]])
+// CHECK-NEXT: tail call void @llvm.aarch64.sme.umop4s.za64.wide.1x2.nxv8i16(i32 1, <vscale x 8 x i16> [[ZN:%.*]], <vscale x 8 x i16> [[ZM_COERCE0:%.*]], <vscale x 8 x i16> [[ZM_COERCE1:%.*]])
// CHECK-NEXT: ret void
//
// CPP-CHECK-LABEL: @_Z29test_svmop4s_1x2_za64_u16_u16u12__SVUint16_t12svuint16x2_t(
// CPP-CHECK-NEXT: entry:
-// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.umop4s.za64.wide.1x2.nxv8i16(i32 3, <vscale x 8 x i16> [[ZN:%.*]], <vscale x 8 x i16> [[ZM_COERCE0:%.*]], <vscale x 8 x i16> [[ZM_COERCE1:%.*]])
+// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.umop4s.za64.wide.1x2.nxv8i16(i32 1, <vscale x 8 x i16> [[ZN:%.*]], <vscale x 8 x i16> [[ZM_COERCE0:%.*]], <vscale x 8 x i16> [[ZM_COERCE1:%.*]])
// CPP-CHECK-NEXT: ret void
//
void test_svmop4s_1x2_za64_u16_u16(svuint16_t zn, svuint16x2_t zm) __arm_streaming __arm_inout("za") {
- SME_ACLE_FUNC(svmop4s_1x2_za64,_u16_u16,)(3, zn, zm);
+ SME_ACLE_FUNC(svmop4s_1x2_za64,_u16_u16,)(1, zn, zm);
}
// CHECK-LABEL: @test_svmop4a_1x2_za64_s16_u16(
// CHECK-NEXT: entry:
-// CHECK-NEXT: tail call void @llvm.aarch64.sme.sumop4a.za64.wide.1x2.nxv8i16(i32 3, <vscale x 8 x i16> [[ZN:%.*]], <vscale x 8 x i16> [[ZM_COERCE0:%.*]], <vscale x 8 x i16> [[ZM_COERCE1:%.*]])
+// CHECK-NEXT: tail call void @llvm.aarch64.sme.sumop4a.za64.wide.1x2.nxv8i16(i32 1, <vscale x 8 x i16> [[ZN:%.*]], <vscale x 8 x i16> [[ZM_COERCE0:%.*]], <vscale x 8 x i16> [[ZM_COERCE1:%.*]])
// CHECK-NEXT: ret void
//
// CPP-CHECK-LABEL: @_Z29test_svmop4a_1x2_za64_s16_u16u11__SVInt16_t12svuint16x2_t(
// CPP-CHECK-NEXT: entry:
-// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.sumop4a.za64.wide.1x2.nxv8i16(i32 3, <vscale x 8 x i16> [[ZN:%.*]], <vscale x 8 x i16> [[ZM_COERCE0:%.*]], <vscale x 8 x i16> [[ZM_COERCE1:%.*]])
+// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.sumop4a.za64.wide.1x2.nxv8i16(i32 1, <vscale x 8 x i16> [[ZN:%.*]], <vscale x 8 x i16> [[ZM_COERCE0:%.*]], <vscale x 8 x i16> [[ZM_COERCE1:%.*]])
// CPP-CHECK-NEXT: ret void
//
void test_svmop4a_1x2_za64_s16_u16(svint16_t zn, svuint16x2_t zm) __arm_streaming __arm_inout("za") {
- SME_ACLE_FUNC(svmop4a_1x2_za64,_s16_u16,)(3, zn, zm);
+ SME_ACLE_FUNC(svmop4a_1x2_za64,_s16_u16,)(1, zn, zm);
}
// CHECK-LABEL: @test_svmop4s_1x2_za64_s16_u16(
// CHECK-NEXT: entry:
-// CHECK-NEXT: tail call void @llvm.aarch64.sme.sumop4s.za64.wide.1x2.nxv8i16(i32 3, <vscale x 8 x i16> [[ZN:%.*]], <vscale x 8 x i16> [[ZM_COERCE0:%.*]], <vscale x 8 x i16> [[ZM_COERCE1:%.*]])
+// CHECK-NEXT: tail call void @llvm.aarch64.sme.sumop4s.za64.wide.1x2.nxv8i16(i32 1, <vscale x 8 x i16> [[ZN:%.*]], <vscale x 8 x i16> [[ZM_COERCE0:%.*]], <vscale x 8 x i16> [[ZM_COERCE1:%.*]])
// CHECK-NEXT: ret void
//
// CPP-CHECK-LABEL: @_Z29test_svmop4s_1x2_za64_s16_u16u11__SVInt16_t12svuint16x2_t(
// CPP-CHECK-NEXT: entry:
-// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.sumop4s.za64.wide.1x2.nxv8i16(i32 3, <vscale x 8 x i16> [[ZN:%.*]], <vscale x 8 x i16> [[ZM_COERCE0:%.*]], <vscale x 8 x i16> [[ZM_COERCE1:%.*]])
+// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.sumop4s.za64.wide.1x2.nxv8i16(i32 1, <vscale x 8 x i16> [[ZN:%.*]], <vscale x 8 x i16> [[ZM_COERCE0:%.*]], <vscale x 8 x i16> [[ZM_COERCE1:%.*]])
// CPP-CHECK-NEXT: ret void
//
void test_svmop4s_1x2_za64_s16_u16(svint16_t zn, svuint16x2_t zm) __arm_streaming __arm_inout("za") {
- SME_ACLE_FUNC(svmop4s_1x2_za64,_s16_u16,)(3, zn, zm);
+ SME_ACLE_FUNC(svmop4s_1x2_za64,_s16_u16,)(1, zn, zm);
}
// CHECK-LABEL: @test_svmop4a_1x2_za64_u16_s16(
// CHECK-NEXT: entry:
-// CHECK-NEXT: tail call void @llvm.aarch64.sme.usmop4a.za64.wide.1x2.nxv8i16(i32 3, <vscale x 8 x i16> [[ZN:%.*]], <vscale x 8 x i16> [[ZM_COERCE0:%.*]], <vscale x 8 x i16> [[ZM_COERCE1:%.*]])
+// CHECK-NEXT: tail call void @llvm.aarch64.sme.usmop4a.za64.wide.1x2.nxv8i16(i32 1, <vscale x 8 x i16> [[ZN:%.*]], <vscale x 8 x i16> [[ZM_COERCE0:%.*]], <vscale x 8 x i16> [[ZM_COERCE1:%.*]])
// CHECK-NEXT: ret void
//
// CPP-CHECK-LABEL: @_Z29test_svmop4a_1x2_za64_u16_s16u12__SVUint16_t11svint16x2_t(
// CPP-CHECK-NEXT: entry:
-// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.usmop4a.za64.wide.1x2.nxv8i16(i32 3, <vscale x 8 x i16> [[ZN:%.*]], <vscale x 8 x i16> [[ZM_COERCE0:%.*]], <vscale x 8 x i16> [[ZM_COERCE1:%.*]])
+// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.usmop4a.za64.wide.1x2.nxv8i16(i32 1, <vscale x 8 x i16> [[ZN:%.*]], <vscale x 8 x i16> [[ZM_COERCE0:%.*]], <vscale x 8 x i16> [[ZM_COERCE1:%.*]])
// CPP-CHECK-NEXT: ret void
//
void test_svmop4a_1x2_za64_u16_s16(svuint16_t zn, svint16x2_t zm) __arm_streaming __arm_inout("za") {
- SME_ACLE_FUNC(svmop4a_1x2_za64,_u16_s16,)(3, zn, zm);
+ SME_ACLE_FUNC(svmop4a_1x2_za64,_u16_s16,)(1, zn, zm);
}
// CHECK-LABEL: @test_svmop4s_1x2_za64_u16_s16(
// CHECK-NEXT: entry:
-// CHECK-NEXT: tail call void @llvm.aarch64.sme.usmop4s.za64.wide.1x2.nxv8i16(i32 3, <vscale x 8 x i16> [[ZN:%.*]], <vscale x 8 x i16> [[ZM_COERCE0:%.*]], <vscale x 8 x i16> [[ZM_COERCE1:%.*]])
+// CHECK-NEXT: tail call void @llvm.aarch64.sme.usmop4s.za64.wide.1x2.nxv8i16(i32 1, <vscale x 8 x i16> [[ZN:%.*]], <vscale x 8 x i16> [[ZM_COERCE0:%.*]], <vscale x 8 x i16> [[ZM_COERCE1:%.*]])
// CHECK-NEXT: ret void
//
// CPP-CHECK-LABEL: @_Z29test_svmop4s_1x2_za64_u16_s16u12__SVUint16_t11svint16x2_t(
// CPP-CHECK-NEXT: entry:
-// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.usmop4s.za64.wide.1x2.nxv8i16(i32 3, <vscale x 8 x i16> [[ZN:%.*]], <vscale x 8 x i16> [[ZM_COERCE0:%.*]], <vscale x 8 x i16> [[ZM_COERCE1:%.*]])
+// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.usmop4s.za64.wide.1x2.nxv8i16(i32 1, <vscale x 8 x i16> [[ZN:%.*]], <vscale x 8 x i16> [[ZM_COERCE0:%.*]], <vscale x 8 x i16> [[ZM_COERCE1:%.*]])
// CPP-CHECK-NEXT: ret void
//
void test_svmop4s_1x2_za64_u16_s16(svuint16_t zn, svint16x2_t zm) __arm_streaming __arm_inout("za") {
- SME_ACLE_FUNC(svmop4s_1x2_za64,_u16_s16,)(3, zn, zm);
+ SME_ACLE_FUNC(svmop4s_1x2_za64,_u16_s16,)(1, zn, zm);
}
// CHECK-LABEL: @test_svmop4a_1x2_za16_f16_f16(
// CHECK-NEXT: entry:
-// CHECK-NEXT: tail call void @llvm.aarch64.sme.mop4a.1x2.nxv8f16(i32 3, <vscale x 8 x half> [[ZN:%.*]], <vscale x 8 x half> [[ZM_COERCE0:%.*]], <vscale x 8 x half> [[ZM_COERCE1:%.*]])
+// CHECK-NEXT: tail call void @llvm.aarch64.sme.mop4a.1x2.nxv8f16(i32 1, <vscale x 8 x half> [[ZN:%.*]], <vscale x 8 x half> [[ZM_COERCE0:%.*]], <vscale x 8 x half> [[ZM_COERCE1:%.*]])
// CHECK-NEXT: ret void
//
// CPP-CHECK-LABEL: @_Z29test_svmop4a_1x2_za16_f16_f16u13__SVFloat16_t13svfloat16x2_t(
// CPP-CHECK-NEXT: entry:
-// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.mop4a.1x2.nxv8f16(i32 3, <vscale x 8 x half> [[ZN:%.*]], <vscale x 8 x half> [[ZM_COERCE0:%.*]], <vscale x 8 x half> [[ZM_COERCE1:%.*]])
+// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.mop4a.1x2.nxv8f16(i32 1, <vscale x 8 x half> [[ZN:%.*]], <vscale x 8 x half> [[ZM_COERCE0:%.*]], <vscale x 8 x half> [[ZM_COERCE1:%.*]])
// CPP-CHECK-NEXT: ret void
//
void test_svmop4a_1x2_za16_f16_f16(svfloat16_t zn, svfloat16x2_t zm) __arm_streaming __arm_inout("za") {
- SME_ACLE_FUNC(svmop4a_1x2_za16,_f16_f16,)(3, zn, zm);
+ SME_ACLE_FUNC(svmop4a_1x2_za16,_f16_f16,)(1, zn, zm);
}
// CHECK-LABEL: @test_svmop4s_1x2_za16_f16_f16(
// CHECK-NEXT: entry:
-// CHECK-NEXT: tail call void @llvm.aarch64.sme.mop4s.1x2.nxv8f16(i32 3, <vscale x 8 x half> [[ZN:%.*]], <vscale x 8 x half> [[ZM_COERCE0:%.*]], <vscale x 8 x half> [[ZM_COERCE1:%.*]])
+// CHECK-NEXT: tail call void @llvm.aarch64.sme.mop4s.1x2.nxv8f16(i32 1, <vscale x 8 x half> [[ZN:%.*]], <vscale x 8 x half> [[ZM_COERCE0:%.*]], <vscale x 8 x half> [[ZM_COERCE1:%.*]])
// CHECK-NEXT: ret void
//
// CPP-CHECK-LABEL: @_Z29test_svmop4s_1x2_za16_f16_f16u13__SVFloat16_t13svfloat16x2_t(
// CPP-CHECK-NEXT: entry:
-// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.mop4s.1x2.nxv8f16(i32 3, <vscale x 8 x half> [[ZN:%.*]], <vscale x 8 x half> [[ZM_COERCE0:%.*]], <vscale x 8 x half> [[ZM_COERCE1:%.*]])
+// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.mop4s.1x2.nxv8f16(i32 1, <vscale x 8 x half> [[ZN:%.*]], <vscale x 8 x half> [[ZM_COERCE0:%.*]], <vscale x 8 x half> [[ZM_COERCE1:%.*]])
// CPP-CHECK-NEXT: ret void
//
void test_svmop4s_1x2_za16_f16_f16(svfloat16_t zn, svfloat16x2_t zm) __arm_streaming __arm_inout("za") {
- SME_ACLE_FUNC(svmop4s_1x2_za16,_f16_f16,)(3, zn, zm);
+ SME_ACLE_FUNC(svmop4s_1x2_za16,_f16_f16,)(1, zn, zm);
}
// CHECK-LABEL: @test_svmop4a_1x2_za32_f32_f32(
// CHECK-NEXT: entry:
-// CHECK-NEXT: tail call void @llvm.aarch64.sme.mop4a.1x2.nxv4f32(i32 3, <vscale x 4 x float> [[ZN:%.*]], <vscale x 4 x float> [[ZM_COERCE0:%.*]], <vscale x 4 x float> [[ZM_COERCE1:%.*]])
+// CHECK-NEXT: tail call void @llvm.aarch64.sme.mop4a.1x2.nxv4f32(i32 1, <vscale x 4 x float> [[ZN:%.*]], <vscale x 4 x float> [[ZM_COERCE0:%.*]], <vscale x 4 x float> [[ZM_COERCE1:%.*]])
// CHECK-NEXT: ret void
//
// CPP-CHECK-LABEL: @_Z29test_svmop4a_1x2_za32_f32_f32u13__SVFloat32_t13svfloat32x2_t(
// CPP-CHECK-NEXT: entry:
-// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.mop4a.1x2.nxv4f32(i32 3, <vscale x 4 x float> [[ZN:%.*]], <vscale x 4 x float> [[ZM_COERCE0:%.*]], <vscale x 4 x float> [[ZM_COERCE1:%.*]])
+// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.mop4a.1x2.nxv4f32(i32 1, <vscale x 4 x float> [[ZN:%.*]], <vscale x 4 x float> [[ZM_COERCE0:%.*]], <vscale x 4 x float> [[ZM_COERCE1:%.*]])
// CPP-CHECK-NEXT: ret void
//
void test_svmop4a_1x2_za32_f32_f32(svfloat32_t zn, svfloat32x2_t zm) __arm_streaming __arm_inout("za") {
- SME_ACLE_FUNC(svmop4a_1x2_za32,_f32_f32,)(3, zn, zm);
+ SME_ACLE_FUNC(svmop4a_1x2_za32,_f32_f32,)(1, zn, zm);
}
// CHECK-LABEL: @test_svmop4s_1x2_za32_f32_f32(
// CHECK-NEXT: entry:
-// CHECK-NEXT: tail call void @llvm.aarch64.sme.mop4s.1x2.nxv4f32(i32 3, <vscale x 4 x float> [[ZN:%.*]], <vscale x 4 x float> [[ZM_COERCE0:%.*]], <vscale x 4 x float> [[ZM_COERCE1:%.*]])
+// CHECK-NEXT: tail call void @llvm.aarch64.sme.mop4s.1x2.nxv4f32(i32 1, <vscale x 4 x float> [[ZN:%.*]], <vscale x 4 x float> [[ZM_COERCE0:%.*]], <vscale x 4 x float> [[ZM_COERCE1:%.*]])
// CHECK-NEXT: ret void
//
// CPP-CHECK-LABEL: @_Z29test_svmop4s_1x2_za32_f32_f32u13__SVFloat32_t13svfloat32x2_t(
// CPP-CHECK-NEXT: entry:
-// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.mop4s.1x2.nxv4f32(i32 3, <vscale x 4 x float> [[ZN:%.*]], <vscale x 4 x float> [[ZM_COERCE0:%.*]], <vscale x 4 x float> [[ZM_COERCE1:%.*]])
+// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.mop4s.1x2.nxv4f32(i32 1, <vscale x 4 x float> [[ZN:%.*]], <vscale x 4 x float> [[ZM_COERCE0:%.*]], <vscale x 4 x float> [[ZM_COERCE1:%.*]])
// CPP-CHECK-NEXT: ret void
//
void test_svmop4s_1x2_za32_f32_f32(svfloat32_t zn, svfloat32x2_t zm) __arm_streaming __arm_inout("za") {
- SME_ACLE_FUNC(svmop4s_1x2_za32,_f32_f32,)(3, zn, zm);
+ SME_ACLE_FUNC(svmop4s_1x2_za32,_f32_f32,)(1, zn, zm);
}
// CHECK-LABEL: @test_svmop4a_1x2_za64_f64_f64(
// CHECK-NEXT: entry:
-// CHECK-NEXT: tail call void @llvm.aarch64.sme.mop4a.1x2.nxv2f64(i32 3, <vscale x 2 x double> [[ZN:%.*]], <vscale x 2 x double> [[ZM_COERCE0:%.*]], <vscale x 2 x double> [[ZM_COERCE1:%.*]])
+// CHECK-NEXT: tail call void @llvm.aarch64.sme.mop4a.1x2.nxv2f64(i32 1, <vscale x 2 x double> [[ZN:%.*]], <vscale x 2 x double> [[ZM_COERCE0:%.*]], <vscale x 2 x double> [[ZM_COERCE1:%.*]])
// CHECK-NEXT: ret void
//
// CPP-CHECK-LABEL: @_Z29test_svmop4a_1x2_za64_f64_f64u13__SVFloat64_t13svfloat64x2_t(
// CPP-CHECK-NEXT: entry:
-// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.mop4a.1x2.nxv2f64(i32 3, <vscale x 2 x double> [[ZN:%.*]], <vscale x 2 x double> [[ZM_COERCE0:%.*]], <vscale x 2 x double> [[ZM_COERCE1:%.*]])
+// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.mop4a.1x2.nxv2f64(i32 1, <vscale x 2 x double> [[ZN:%.*]], <vscale x 2 x double> [[ZM_COERCE0:%.*]], <vscale x 2 x double> [[ZM_COERCE1:%.*]])
// CPP-CHECK-NEXT: ret void
//
void test_svmop4a_1x2_za64_f64_f64(svfloat64_t zn, svfloat64x2_t zm) __arm_streaming __arm_inout("za") {
- SME_ACLE_FUNC(svmop4a_1x2_za64,_f64_f64,)(3, zn, zm);
+ SME_ACLE_FUNC(svmop4a_1x2_za64,_f64_f64,)(1, zn, zm);
}
// CHECK-LABEL: @test_svmop4s_1x2_za64_f64_f64(
// CHECK-NEXT: entry:
-// CHECK-NEXT: tail call void @llvm.aarch64.sme.mop4s.1x2.nxv2f64(i32 3, <vscale x 2 x double> [[ZN:%.*]], <vscale x 2 x double> [[ZM_COERCE0:%.*]], <vscale x 2 x double> [[ZM_COERCE1:%.*]])
+// CHECK-NEXT: tail call void @llvm.aarch64.sme.mop4s.1x2.nxv2f64(i32 1, <vscale x 2 x double> [[ZN:%.*]], <vscale x 2 x double> [[ZM_COERCE0:%.*]], <vscale x 2 x double> [[ZM_COERCE1:%.*]])
// CHECK-NEXT: ret void
//
// CPP-CHECK-LABEL: @_Z29test_svmop4s_1x2_za64_f64_f64u13__SVFloat64_t13svfloat64x2_t(
// CPP-CHECK-NEXT: entry:
-// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.mop4s.1x2.nxv2f64(i32 3, <vscale x 2 x double> [[ZN:%.*]], <vscale x 2 x double> [[ZM_COERCE0:%.*]], <vscale x 2 x double> [[ZM_COERCE1:%.*]])
+// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.mop4s.1x2.nxv2f64(i32 1, <vscale x 2 x double> [[ZN:%.*]], <vscale x 2 x double> [[ZM_COERCE0:%.*]], <vscale x 2 x double> [[ZM_COERCE1:%.*]])
// CPP-CHECK-NEXT: ret void
//
void test_svmop4s_1x2_za64_f64_f64(svfloat64_t zn, svfloat64x2_t zm) __arm_streaming __arm_inout("za") {
- SME_ACLE_FUNC(svmop4s_1x2_za64,_f64_f64,)(3, zn, zm);
+ SME_ACLE_FUNC(svmop4s_1x2_za64,_f64_f64,)(1, zn, zm);
}
// CHECK-LABEL: @test_svmop4a_1x2_za16_bf16_bf16(
// CHECK-NEXT: entry:
-// CHECK-NEXT: tail call void @llvm.aarch64.sme.mop4a.1x2.nxv8bf16(i32 3, <vscale x 8 x bfloat> [[ZN:%.*]], <vscale x 8 x bfloat> [[ZM_COERCE0:%.*]], <vscale x 8 x bfloat> [[ZM_COERCE1:%.*]])
+// CHECK-NEXT: tail call void @llvm.aarch64.sme.mop4a.1x2.nxv8bf16(i32 1, <vscale x 8 x bfloat> [[ZN:%.*]], <vscale x 8 x bfloat> [[ZM_COERCE0:%.*]], <vscale x 8 x bfloat> [[ZM_COERCE1:%.*]])
// CHECK-NEXT: ret void
//
// CPP-CHECK-LABEL: @_Z31test_svmop4a_1x2_za16_bf16_bf16u14__SVBfloat16_t14svbfloat16x2_t(
// CPP-CHECK-NEXT: entry:
-// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.mop4a.1x2.nxv8bf16(i32 3, <vscale x 8 x bfloat> [[ZN:%.*]], <vscale x 8 x bfloat> [[ZM_COERCE0:%.*]], <vscale x 8 x bfloat> [[ZM_COERCE1:%.*]])
+// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.mop4a.1x2.nxv8bf16(i32 1, <vscale x 8 x bfloat> [[ZN:%.*]], <vscale x 8 x bfloat> [[ZM_COERCE0:%.*]], <vscale x 8 x bfloat> [[ZM_COERCE1:%.*]])
// CPP-CHECK-NEXT: ret void
//
void test_svmop4a_1x2_za16_bf16_bf16(svbfloat16_t zn, svbfloat16x2_t zm) __arm_streaming __arm_inout("za") {
- SME_ACLE_FUNC(svmop4a_1x2_za16,_bf16_bf16,)(3, zn, zm);
+ SME_ACLE_FUNC(svmop4a_1x2_za16,_bf16_bf16,)(1, zn, zm);
}
// CHECK-LABEL: @test_svmop4s_1x2_za16_bf16_bf16(
// CHECK-NEXT: entry:
-// CHECK-NEXT: tail call void @llvm.aarch64.sme.mop4s.1x2.nxv8bf16(i32 3, <vscale x 8 x bfloat> [[ZN:%.*]], <vscale x 8 x bfloat> [[ZM_COERCE0:%.*]], <vscale x 8 x bfloat> [[ZM_COERCE1:%.*]])
+// CHECK-NEXT: tail call void @llvm.aarch64.sme.mop4s.1x2.nxv8bf16(i32 1, <vscale x 8 x bfloat> [[ZN:%.*]], <vscale x 8 x bfloat> [[ZM_COERCE0:%.*]], <vscale x 8 x bfloat> [[ZM_COERCE1:%.*]])
// CHECK-NEXT: ret void
//
// CPP-CHECK-LABEL: @_Z31test_svmop4s_1x2_za16_bf16_bf16u14__SVBfloat16_t14svbfloat16x2_t(
// CPP-CHECK-NEXT: entry:
-// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.mop4s.1x2.nxv8bf16(i32 3, <vscale x 8 x bfloat> [[ZN:%.*]], <vscale x 8 x bfloat> [[ZM_COERCE0:%.*]], <vscale x 8 x bfloat> [[ZM_COERCE1:%.*]])
+// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.mop4s.1x2.nxv8bf16(i32 1, <vscale x 8 x bfloat> [[ZN:%.*]], <vscale x 8 x bfloat> [[ZM_COERCE0:%.*]], <vscale x 8 x bfloat> [[ZM_COERCE1:%.*]])
// CPP-CHECK-NEXT: ret void
//
void test_svmop4s_1x2_za16_bf16_bf16(svbfloat16_t zn, svbfloat16x2_t zm) __arm_streaming __arm_inout("za") {
- SME_ACLE_FUNC(svmop4s_1x2_za16,_bf16_bf16,)(3, zn, zm);
+ SME_ACLE_FUNC(svmop4s_1x2_za16,_bf16_bf16,)(1, zn, zm);
}
diff --git a/llvm/lib/Target/AArch64/SMEInstrFormats.td b/llvm/lib/Target/AArch64/SMEInstrFormats.td
index 8ac49fc2dedfb..92088da1873a7 100644
--- a/llvm/lib/Target/AArch64/SMEInstrFormats.td
+++ b/llvm/lib/Target/AArch64/SMEInstrFormats.td
@@ -681,7 +681,7 @@ multiclass sme_quarter_outer_product_i64<bit zn_u, bit zm_u, bit subtr, string m
def NAME # _MZ2Z_HtoD # _PSEUDO : sme2_quarter_tile_outer_product_pseudo_single_multi<ZPR16Mul2_Lo, ZZ_h_mul_r_Hi, SMEMatrixTileD>, SMEPseudo2Instr<NAME # _MZ2Z_HtoD, 0>;
- def : SME2_ZA_Tile_Vec_Multi_Pat<NAME # _MZ2Z_HtoD, !cast<SDPatternOperator>(op # "_1x2"), timm32_0_3, nxv8i16>;
+ def : SME2_ZA_Tile_Vec_Multi_Pat<NAME # _MZ2Z_HtoD, !cast<SDPatternOperator>(op # "_1x2"), timm32_0_7, nxv8i16>;
def _M2Z2Z_HtoD : sme_quarter_outer_product_i64<{zn_u, 1}, {zm_u, 1}, subtr,
ZZ_h_mul_r_Lo, ZZ_h_mul_r_Hi, mnemonic>;
@@ -5696,7 +5696,7 @@ multiclass sme2_fmop4as_fp16_non_widening<bit S, string mnemonic, string op> {
def NAME # _MZ2Z_H # _PSEUDO : sme2_quarter_tile_outer_product_pseudo_single_multi<ZPR16Mul2_Lo, ZZ_h_mul_r_Hi, SMEMatrixTileH>, SMEPseudo2Instr<NAME # _MZ2Z_H, 0>;
- def : SME2_ZA_Tile_Vec_Multi_Pat<NAME # _MZ2Z_H, !cast<SDPatternOperator>(op # "_1x2"), timm32_0_3, nxv8f16>;
+ def : SME2_ZA_Tile_Vec_Multi_Pat<NAME # _MZ2Z_H, !cast<SDPatternOperator>(op # "_1x2"), timm32_0_1, nxv8f16>;
// Multiple vectors
def _M2Z2Z_H : sme2_fp16_quarter_tile_outer_product<1, 1, S, mnemonic, ZZ_h_mul_r_Lo, ZZ_h_mul_r_Hi>;
@@ -5776,7 +5776,7 @@ multiclass sme2_bfmop4as_non_widening<bit S, string mnemonic, string op> {
def NAME # _MZ2Z_H # _PSEUDO : sme2_quarter_tile_outer_product_pseudo_single_multi<ZPR16Mul2_Lo, ZZ_h_mul_r_Hi, SMEMatrixTileH>, SMEPseudo2Instr<NAME # _MZ2Z_H, 0>;
- def : SME2_ZA_Tile_Vec_Multi_Pat<NAME # _MZ2Z_H, !cast<SDPatternOperator>(op # "_1x2"), timm32_0_3, nxv8bf16>;
+ def : SME2_ZA_Tile_Vec_Multi_Pat<NAME # _MZ2Z_H, !cast<SDPatternOperator>(op # "_1x2"), timm32_0_1, nxv8bf16>;
// Multiple vectors
def _M2Z2Z_H : sme2_bf16_fp16_quarter_tile_outer_product<1, 1, S, mnemonic, ZZ_h_mul_r_Lo, ZZ_h_mul_r_Hi>;
@@ -5866,7 +5866,7 @@ multiclass sme2_fmop4as_fp64_non_widening<bit S, string mnemonic, string op> {
def NAME # _MZ2Z_D # _PSEUDO : sme2_quarter_tile_outer_product_pseudo_single_multi<ZPR64Mul2_Lo, ZZ_d_mul_r_Hi, SMEMatrixTileD>, SMEPseudo2Instr<NAME # _MZ2Z_D, 0>;
- def : SME2_ZA_Tile_Vec_Multi_Pat<NAME # _MZ2Z_D, !cast<SDPatternOperator>(op # "_1x2"), timm32_0_3, nxv2f64>;
+ def : SME2_ZA_Tile_Vec_Multi_Pat<NAME # _MZ2Z_D, !cast<SDPatternOperator>(op # "_1x2"), timm32_0_7, nxv2f64>;
// Multiple vectors
def _M2Z2Z_D : sme2_fp64_quarter_tile_outer_product<1, 1, S, mnemonic, ZZ_d_mul_r_Lo, ZZ_d_mul_r_Hi>;
diff --git a/llvm/test/CodeGen/AArch64/sme2-intrinsics-mop4a_1x2.ll b/llvm/test/CodeGen/AArch64/sme2-intrinsics-mop4a_1x2.ll
index 6e2733b6c6b64..f3540458dcaa6 100644
--- a/llvm/test/CodeGen/AArch64/sme2-intrinsics-mop4a_1x2.ll
+++ b/llvm/test/CodeGen/AArch64/sme2-intrinsics-mop4a_1x2.ll
@@ -9,9 +9,9 @@ define void @mop4a_za32_s8(<vscale x 16 x i8> %zn, <vscale x 16 x i8> %zm1, <vsc
; CHECK: // %bb.0:
; CHECK-NEXT: mov z25.d, z2.d
; CHECK-NEXT: mov z24.d, z1.d
-; CHECK-NEXT: smop4a za1.s, z0.b, { z24.b, z25.b }
+; CHECK-NEXT: smop4a za0.s, z0.b, { z24.b, z25.b }
; CHECK-NEXT: ret
- call void @llvm.aarch64.sme.smop4a.wide.1x2.nxv16i8(i32 1, <vscale x 16 x i8> %zn, <vscale x 16 x i8> %zm1, <vscale x 16 x i8> %zm2)
+ call void @llvm.aarch64.sme.smop4a.wide.1x2.nxv16i8(i32 0, <vscale x 16 x i8> %zn, <vscale x 16 x i8> %zm1, <vscale x 16 x i8> %zm2)
ret void
}
@@ -20,9 +20,9 @@ define void @mop4s_za32_s8(<vscale x 16 x i8> %zn, <vscale x 16 x i8> %zm1, <vsc
; CHECK: // %bb.0:
; CHECK-NEXT: mov z25.d, z2.d
; CHECK-NEXT: mov z24.d, z1.d
-; CHECK-NEXT: smop4s za1.s, z0.b, { z24.b, z25.b }
+; CHECK-NEXT: smop4s za0.s, z0.b, { z24.b, z25.b }
; CHECK-NEXT: ret
- call void @llvm.aarch64.sme.smop4s.wide.1x2.nxv16i8(i32 1, <vscale x 16 x i8> %zn, <vscale x 16 x i8> %zm1, <vscale x 16 x i8> %zm2)
+ call void @llvm.aarch64.sme.smop4s.wide.1x2.nxv16i8(i32 0, <vscale x 16 x i8> %zn, <vscale x 16 x i8> %zm1, <vscale x 16 x i8> %zm2)
ret void
}
@@ -31,9 +31,9 @@ define void @mop4a_za32_u8(<vscale x 16 x i8> %zn, <vscale x 16 x i8> %zm1, <vsc
; CHECK: // %bb.0:
; CHECK-NEXT: mov z25.d, z2.d
; CHECK-NEXT: mov z24.d, z1.d
-; CHECK-NEXT: umop4a za1.s, z0.b, { z24.b, z25.b }
+; CHECK-NEXT: umop4a za0.s, z0.b, { z24.b, z25.b }
; CHECK-NEXT: ret
- call void @llvm.aarch64.sme.umop4a.wide.1x2.nxv16i8(i32 1, <vscale x 16 x i8> %zn, <vscale x 16 x i8> %zm1, <vscale x 16 x i8> %zm2)
+ call void @llvm.aarch64.sme.umop4a.wide.1x2.nxv16i8(i32 0, <vscale x 16 x i8> %zn, <vscale x 16 x i8> %zm1, <vscale x 16 x i8> %zm2)
ret void
}
@@ -42,9 +42,9 @@ define void @mop4s_za32_u8(<vscale x 16 x i8> %zn, <vscale x 16 x i8> %zm1, <vsc
; CHECK: // %bb.0:
; CHECK-NEXT: mov z25.d, z2.d
; CHECK-NEXT: mov z24.d, z1.d
-; CHECK-NEXT: umop4s za1.s, z0.b, { z24.b, z25.b }
+; CHECK-NEXT: umop4s za0.s, z0.b, { z24.b, z25.b }
; CHECK-NEXT: ret
- call void @llvm.aarch64.sme.umop4s.wide.1x2.nxv16i8(i32 1, <vscale x 16 x i8> %zn, <vscale x 16 x i8> %zm1, <vscale x 16 x i8> %zm2)
+ call void @llvm.aarch64.sme.umop4s.wide.1x2.nxv16i8(i32 0, <vscale x 16 x i8> %zn, <vscale x 16 x i8> %zm1, <vscale x 16 x i8> %zm2)
ret void
}
@@ -53,9 +53,9 @@ define void @mop4a_za32_s8_u8(<vscale x 16 x i8> %zn, <vscale x 16 x i8> %zm1, <
; CHECK: // %bb.0:
; CHECK-NEXT: mov z25.d, z2.d
; CHECK-NEXT: mov z24.d, z1.d
-; CHECK-NEXT: sumop4a za1.s, z0.b, { z24.b, z25.b }
+; CHECK-NEXT: sumop4a za0.s, z0.b, { z24.b, z25.b }
; CHECK-NEXT: ret
- call void @llvm.aarch64.sme.sumop4a.wide.1x2.nxv16i8(i32 1, <vscale x 16 x i8> %zn, <vscale x 16 x i8> %zm1, <vscale x 16 x i8> %zm2)
+ call void @llvm.aarch64.sme.sumop4a.wide.1x2.nxv16i8(i32 0, <vscale x 16 x i8> %zn, <vscale x 16 x i8> %zm1, <vscale x 16 x i8> %zm2)
ret void
}
@@ -64,9 +64,9 @@ define void @mop4s_za32_s8_u8(<vscale x 16 x i8> %zn, <vscale x 16 x i8> %zm1, <
; CHECK: // %bb.0:
; CHECK-NEXT: mov z25.d, z2.d
; CHECK-NEXT: mov z24.d, z1.d
-; CHECK-NEXT: sumop4s za1.s, z0.b, { z24.b, z25.b }
+; CHECK-NEXT: sumop4s za0.s, z0.b, { z24.b, z25.b }
; CHECK-NEXT: ret
- call void @llvm.aarch64.sme.sumop4s.wide.1x2.nxv16i8(i32 1, <vscale x 16 x i8> %zn, <vscale x 16 x i8> %zm1, <vscale x 16 x i8> %zm2)
+ call void @llvm.aarch64.sme.sumop4s.wide.1x2.nxv16i8(i32 0, <vscale x 16 x i8> %zn, <vscale x 16 x i8> %zm1, <vscale x 16 x i8> %zm2)
ret void
}
@@ -75,9 +75,9 @@ define void @mop4a_za32_u8_s8(<vscale x 16 x i8> %zn, <vscale x 16 x i8> %zm1, <
; CHECK: // %bb.0:
; CHECK-NEXT: mov z25.d, z2.d
; CHECK-NEXT: mov z24.d, z1.d
-; CHECK-NEXT: usmop4a za1.s, z0.b, { z24.b, z25.b }
+; CHECK-NEXT: usmop4a za0.s, z0.b, { z24.b, z25.b }
; CHECK-NEXT: ret
- call void @llvm.aarch64.sme.usmop4a.wide.1x2.nxv16i8(i32 1, <vscale x 16 x i8> %zn, <vscale x 16 x i8> %zm1, <vscale x 16 x i8> %zm2)
+ call void @llvm.aarch64.sme.usmop4a.wide.1x2.nxv16i8(i32 0, <vscale x 16 x i8> %zn, <vscale x 16 x i8> %zm1, <vscale x 16 x i8> %zm2)
ret void
}
@@ -86,9 +86,9 @@ define void @mop4s_za32_u8_s8(<vscale x 16 x i8> %zn, <vscale x 16 x i8> %zm1, <
; CHECK: // %bb.0:
; CHECK-NEXT: mov z25.d, z2.d
; CHECK-NEXT: mov z24.d, z1.d
-; CHECK-NEXT: usmop4s za1.s, z0.b, { z24.b, z25.b }
+; CHECK-NEXT: usmop4s za0.s, z0.b, { z24.b, z25.b }
; CHECK-NEXT: ret
- call void @llvm.aarch64.sme.usmop4s.wide.1x2.nxv16i8(i32 1, <vscale x 16 x i8> %zn, <vscale x 16 x i8> %zm1, <vscale x 16 x i8> %zm2)
+ call void @llvm.aarch64.sme.usmop4s.wide.1x2.nxv16i8(i32 0, <vscale x 16 x i8> %zn, <vscale x 16 x i8> %zm1, <vscale x 16 x i8> %zm2)
ret void
}
@@ -98,9 +98,9 @@ define void @mop4a_za32_s16(<vscale x 8 x i16> %zn, <vscale x 8 x i16> %zm1, <vs
; CHECK: // %bb.0:
; CHECK-NEXT: mov z25.d, z2.d
; CHECK-NEXT: mov z24.d, z1.d
-; CHECK-NEXT: smop4a za1.s, z0.h, { z24.h, z25.h }
+; CHECK-NEXT: smop4a za0.s, z0.h, { z24.h, z25.h }
; CHECK-NEXT: ret
- call void @llvm.aarch64.sme.smop4a.wide.1x2.nxv8i16(i32 1, <vscale x 8 x i16> %zn, <vscale x 8 x i16> %zm1, <vscale x 8 x i16> %zm2)
+ call void @llvm.aarch64.sme.smop4a.wide.1x2.nxv8i16(i32 0, <vscale x 8 x i16> %zn, <vscale x 8 x i16> %zm1, <vscale x 8 x i16> %zm2)
ret void
}
@@ -109,9 +109,9 @@ define void @mop4s_za32_s16(<vscale x 8 x i16> %zn, <vscale x 8 x i16> %zm1, <vs
; CHECK: // %bb.0:
; CHECK-NEXT: mov z25.d, z2.d
; CHECK-NEXT: mov z24.d, z1.d
-; CHECK-NEXT: smop4s za1.s, z0.h, { z24.h, z25.h }
+; CHECK-NEXT: smop4s za0.s, z0.h, { z24.h, z25.h }
; CHECK-NEXT: ret
- call void @llvm.aarch64.sme.smop4s.wide.1x2.nxv8i16(i32 1, <vscale x 8 x i16> %zn, <vscale x 8 x i16> %zm1, <vscale x 8 x i16> %zm2)
+ call void @llvm.aarch64.sme.smop4s.wide.1x2.nxv8i16(i32 0, <vscale x 8 x i16> %zn, <vscale x 8 x i16> %zm1, <vscale x 8 x i16> %zm2)
ret void
}
@@ -120,9 +120,9 @@ define void @mop4a_za32_u16(<vscale x 8 x i16> %zn, <vscale x 8 x i16> %zm1, <vs
; CHECK: // %bb.0:
; CHECK-NEXT: mov z25.d, z2.d
; CHECK-NEXT: mov z24.d, z1.d
-; CHECK-NEXT: umop4a za1.s, z0.h, { z24.h, z25.h }
+; CHECK-NEXT: umop4a za0.s, z0.h, { z24.h, z25.h }
; CHECK-NEXT: ret
- call void @llvm.aarch64.sme.umop4a.wide.1x2.nxv8i16(i32 1, <vscale x 8 x i16> %zn, <vscale x 8 x i16> %zm1, <vscale x 8 x i16> %zm2)
+ call void @llvm.aarch64.sme.umop4a.wide.1x2.nxv8i16(i32 0, <vscale x 8 x i16> %zn, <vscale x 8 x i16> %zm1, <vscale x 8 x i16> %zm2)
ret void
}
@@ -131,9 +131,9 @@ define void @mop4s_za32_u16(<vscale x 8 x i16> %zn, <vscale x 8 x i16> %zm1, <vs
; CHECK: // %bb.0:
; CHECK-NEXT: mov z25.d, z2.d
; CHECK-NEXT: mov z24.d, z1.d
-; CHECK-NEXT: umop4s za1.s, z0.h, { z24.h, z25.h }
+; CHECK-NEXT: umop4s za0.s, z0.h, { z24.h, z25.h }
; CHECK-NEXT: ret
- call void @llvm.aarch64.sme.umop4s.wide.1x2.nxv8i16(i32 1, <vscale x 8 x i16> %zn, <vscale x 8 x i16> %zm1, <vscale x 8 x i16> %zm2)
+ call void @llvm.aarch64.sme.umop4s.wide.1x2.nxv8i16(i32 0, <vscale x 8 x i16> %zn, <vscale x 8 x i16> %zm1, <vscale x 8 x i16> %zm2)
ret void
}
@@ -142,9 +142,9 @@ define void @mop4a_za32_f16(<vscale x 8 x half> %zn, <vscale x 8 x half> %zm1, <
; CHECK: // %bb.0:
; CHECK-NEXT: mov z25.d, z2.d
; CHECK-NEXT: mov z24.d, z1.d
-; CHECK-NEXT: fmop4a za1.s, z0.h, { z24.h, z25.h }
+; CHECK-NEXT: fmop4a za0.s, z0.h, { z24.h, z25.h }
; CHECK-NEXT: ret
- call void @llvm.aarch64.sme.mop4a.wide.1x2.nxv8f16(i32 1, <vscale x 8 x half> %zn, <vscale x 8 x half> %zm1, <vscale x 8 x half> %zm2)
+ call void @llvm.aarch64.sme.mop4a.wide.1x2.nxv8f16(i32 0, <vscale x 8 x half> %zn, <vscale x 8 x half> %zm1, <vscale x 8 x half> %zm2)
ret void
}
@@ -153,9 +153,9 @@ define void @mop4s_za32_f16(<vscale x 8 x half> %zn, <vscale x 8 x half> %zm1, <
; CHECK: // %bb.0:
; CHECK-NEXT: mov z25.d, z2.d
; CHECK-NEXT: mov z24.d, z1.d
-; CHECK-NEXT: fmop4s za1.s, z0.h, { z24.h, z25.h }
+; CHECK-NEXT: fmop4s za0.s, z0.h, { z24.h, z25.h }
; CHECK-NEXT: ret
- call void @llvm.aarch64.sme.mop4s.wide.1x2.nxv8f16(i32 1, <vscale x 8 x half> %zn, <vscale x 8 x half> %zm1, <vscale x 8 x half> %zm2)
+ call void @llvm.aarch64.sme.mop4s.wide.1x2.nxv8f16(i32 0, <vscale x 8 x half> %zn, <vscale x 8 x half> %zm1, <vscale x 8 x half> %zm2)
ret void
}
@@ -164,9 +164,9 @@ define void @mop4a_za32_bf16(<vscale x 8 x bfloat> %zn, <vscale x 8 x bfloat> %z
; CHECK: // %bb.0:
; CHECK-NEXT: mov z25.d, z2.d
; CHECK-NEXT: mov z24.d, z1.d
-; CHECK-NEXT: bfmop4a za1.s, z0.h, { z24.h, z25.h }
+; CHECK-NEXT: bfmop4a za0.s, z0.h, { z24.h, z25.h }
; CHECK-NEXT: ret
- call void @llvm.aarch64.sme.mop4a.wide.1x2.nxv8bf16(i32 1, <vscale x 8 x bfloat> %zn, <vscale x 8 x bfloat> %zm1, <vscale x 8 x bfloat> %zm2)
+ call void @llvm.aarch64.sme.mop4a.wide.1x2.nxv8bf16(i32 0, <vscale x 8 x bfloat> %zn, <vscale x 8 x bfloat> %zm1, <vscale x 8 x bfloat> %zm2)
ret void
}
@@ -175,9 +175,9 @@ define void @mop4s_za32_bf16(<vscale x 8 x bfloat> %zn, <vscale x 8 x bfloat> %z
; CHECK: // %bb.0:
; CHECK-NEXT: mov z25.d, z2.d
; CHECK-NEXT: mov z24.d, z1.d
-; CHECK-NEXT: bfmop4s za1.s, z0.h, { z24.h, z25.h }
+; CHECK-NEXT: bfmop4s za0.s, z0.h, { z24.h, z25.h }
; CHECK-NEXT: ret
- call void @llvm.aarch64.sme.mop4s.wide.1x2.nxv8bf16(i32 1, <vscale x 8 x bfloat> %zn, <vscale x 8 x bfloat> %zm1, <vscale x 8 x bfloat> %zm2)
+ call void @llvm.aarch64.sme.mop4s.wide.1x2.nxv8bf16(i32 0, <vscale x 8 x bfloat> %zn, <vscale x 8 x bfloat> %zm1, <vscale x 8 x bfloat> %zm2)
ret void
}
@@ -186,9 +186,9 @@ define void @mop4a_za64_s16(<vscale x 8 x i16> %zn, <vscale x 8 x i16> %zm1, <vs
; CHECK: // %bb.0:
; CHECK-NEXT: mov z25.d, z2.d
; CHECK-NEXT: mov z24.d, z1.d
-; CHECK-NEXT: smop4a za1.d, z0.h, { z24.h, z25.h }
+; CHECK-NEXT: smop4a za0.d, z0.h, { z24.h, z25.h }
; CHECK-NEXT: ret
- call void @llvm.aarch64.sme.smop4a.za64.wide.1x2.nxv8i16(i32 1, <vscale x 8 x i16> %zn, <vscale x 8 x i16> %zm1, <vscale x 8 x i16> %zm2)
+ call void @llvm.aarch64.sme.smop4a.za64.wide.1x2.nxv8i16(i32 0, <vscale x 8 x i16> %zn, <vscale x 8 x i16> %zm1, <vscale x 8 x i16> %zm2)
ret void
}
@@ -197,9 +197,9 @@ define void @mop4s_za64_s16(<vscale x 8 x i16> %zn, <vscale x 8 x i16> %zm1, <vs
; CHECK: // %bb.0:
; CHECK-NEXT: mov z25.d, z2.d
; CHECK-NEXT: mov z24.d, z1.d
-; CHECK-NEXT: smop4s za1.d, z0.h, { z24.h, z25.h }
+; CHECK-NEXT: smop4s za0.d, z0.h, { z24.h, z25.h }
; CHECK-NEXT: ret
- call void @llvm.aarch64.sme.smop4s.za64.wide.1x2.nxv8i16(i32 1, <vscale x 8 x i16> %zn, <vscale x 8 x i16> %zm1, <vscale x 8 x i16> %zm2)
+ call void @llvm.aarch64.sme.smop4s.za64.wide.1x2.nxv8i16(i32 0, <vscale x 8 x i16> %zn, <vscale x 8 x i16> %zm1, <vscale x 8 x i16> %zm2)
ret void
}
@@ -208,9 +208,9 @@ define void @mop4a_za64_u16(<vscale x 8 x i16> %zn, <vscale x 8 x i16> %zm1, <vs
; CHECK: // %bb.0:
; CHECK-NEXT: mov z25.d, z2.d
; CHECK-NEXT: mov z24.d, z1.d
-; CHECK-NEXT: umop4a za1.d, z0.h, { z24.h, z25.h }
+; CHECK-NEXT: umop4a za0.d, z0.h, { z24.h, z25.h }
; CHECK-NEXT: ret
- call void @llvm.aarch64.sme.umop4a.za64.wide.1x2.nxv8i16(i32 1, <vscale x 8 x i16> %zn, <vscale x 8 x i16> %zm1, <vscale x 8 x i16> %zm2)
+ call void @llvm.aarch64.sme.umop4a.za64.wide.1x2.nxv8i16(i32 0, <vscale x 8 x i16> %zn, <vscale x 8 x i16> %zm1, <vscale x 8 x i16> %zm2)
ret void
}
@@ -219,9 +219,9 @@ define void @mop4s_za64_u16(<vscale x 8 x i16> %zn, <vscale x 8 x i16> %zm1, <vs
; CHECK: // %bb.0:
; CHECK-NEXT: mov z25.d, z2.d
; CHECK-NEXT: mov z24.d, z1.d
-; CHECK-NEXT: umop4s za1.d, z0.h, { z24.h, z25.h }
+; CHECK-NEXT: umop4s za0.d, z0.h, { z24.h, z25.h }
; CHECK-NEXT: ret
- call void @llvm.aarch64.sme.umop4s.za64.wide.1x2.nxv8i16(i32 1, <vscale x 8 x i16> %zn, <vscale x 8 x i16> %zm1, <vscale x 8 x i16> %zm2)
+ call void @llvm.aarch64.sme.umop4s.za64.wide.1x2.nxv8i16(i32 0, <vscale x 8 x i16> %zn, <vscale x 8 x i16> %zm1, <vscale x 8 x i16> %zm2)
ret void
}
@@ -230,9 +230,9 @@ define void @mop4a_za64_s16_u16(<vscale x 8 x i16> %zn, <vscale x 8 x i16> %zm1,
; CHECK: // %bb.0:
; CHECK-NEXT: mov z25.d, z2.d
; CHECK-NEXT: mov z24.d, z1.d
-; CHECK-NEXT: sumop4a za1.d, z0.h, { z24.h, z25.h }
+; CHECK-NEXT: sumop4a za0.d, z0.h, { z24.h, z25.h }
; CHECK-NEXT: ret
- call void @llvm.aarch64.sme.sumop4a.za64.wide.1x2.nxv8i16(i32 1, <vscale x 8 x i16> %zn, <vscale x 8 x i16> %zm1, <vscale x 8 x i16> %zm2)
+ call void @llvm.aarch64.sme.sumop4a.za64.wide.1x2.nxv8i16(i32 0, <vscale x 8 x i16> %zn, <vscale x 8 x i16> %zm1, <vscale x 8 x i16> %zm2)
ret void
}
@@ -241,9 +241,9 @@ define void @mop4s_za64_s16_u16(<vscale x 8 x i16> %zn, <vscale x 8 x i16> %zm1,
; CHECK: // %bb.0:
; CHECK-NEXT: mov z25.d, z2.d
; CHECK-NEXT: mov z24.d, z1.d
-; CHECK-NEXT: sumop4s za1.d, z0.h, { z24.h, z25.h }
+; CHECK-NEXT: sumop4s za0.d, z0.h, { z24.h, z25.h }
; CHECK-NEXT: ret
- call void @llvm.aarch64.sme.sumop4s.za64.wide.1x2.nxv8i16(i32 1, <vscale x 8 x i16> %zn, <vscale x 8 x i16> %zm1, <vscale x 8 x i16> %zm2)
+ call void @llvm.aarch64.sme.sumop4s.za64.wide.1x2.nxv8i16(i32 0, <vscale x 8 x i16> %zn, <vscale x 8 x i16> %zm1, <vscale x 8 x i16> %zm2)
ret void
}
@@ -252,9 +252,9 @@ define void @mop4a_za64_u16_s16(<vscale x 8 x i16> %zn, <vscale x 8 x i16> %zm1,
; CHECK: // %bb.0:
; CHECK-NEXT: mov z25.d, z2.d
; CHECK-NEXT: mov z24.d, z1.d
-; CHECK-NEXT: usmop4a za1.d, z0.h, { z24.h, z25.h }
+; CHECK-NEXT: usmop4a za0.d, z0.h, { z24.h, z25.h }
; CHECK-NEXT: ret
- call void @llvm.aarch64.sme.usmop4a.za64.wide.1x2.nxv8i16(i32 1, <vscale x 8 x i16> %zn, <vscale x 8 x i16> %zm1, <vscale x 8 x i16> %zm2)
+ call void @llvm.aarch64.sme.usmop4a.za64.wide.1x2.nxv8i16(i32 0, <vscale x 8 x i16> %zn, <vscale x 8 x i16> %zm1, <vscale x 8 x i16> %zm2)
ret void
}
@@ -263,9 +263,9 @@ define void @mop4s_za64_u16_s16(<vscale x 8 x i16> %zn, <vscale x 8 x i16> %zm1,
; CHECK: // %bb.0:
; CHECK-NEXT: mov z25.d, z2.d
; CHECK-NEXT: mov z24.d, z1.d
-; CHECK-NEXT: usmop4s za1.d, z0.h, { z24.h, z25.h }
+; CHECK-NEXT: usmop4s za0.d, z0.h, { z24.h, z25.h }
; CHECK-NEXT: ret
- call void @llvm.aarch64.sme.usmop4s.za64.wide.1x2.nxv8i16(i32 1, <vscale x 8 x i16> %zn, <vscale x 8 x i16> %zm1, <vscale x 8 x i16> %zm2)
+ call void @llvm.aarch64.sme.usmop4s.za64.wide.1x2.nxv8i16(i32 0, <vscale x 8 x i16> %zn, <vscale x 8 x i16> %zm1, <vscale x 8 x i16> %zm2)
ret void
}
@@ -275,9 +275,9 @@ define void @mop4a_za16_f16(<vscale x 8 x half> %zn, <vscale x 8 x half> %zm1, <
; CHECK: // %bb.0:
; CHECK-NEXT: mov z25.d, z2.d
; CHECK-NEXT: mov z24.d, z1.d
-; CHECK-NEXT: fmop4a za1.h, z0.h, { z24.h, z25.h }
+; CHECK-NEXT: fmop4a za0.h, z0.h, { z24.h, z25.h }
; CHECK-NEXT: ret
- call void @llvm.aarch64.sme.mop4a.1x2.nxv8f16(i32 1, <vscale x 8 x half> %zn, <vscale x 8 x half> %zm1, <vscale x 8 x half> %zm2)
+ call void @llvm.aarch64.sme.mop4a.1x2.nxv8f16(i32 0, <vscale x 8 x half> %zn, <vscale x 8 x half> %zm1, <vscale x 8 x half> %zm2)
ret void
}
@@ -286,9 +286,9 @@ define void @mop4s_za16_f16(<vscale x 8 x half> %zn, <vscale x 8 x half> %zm1, <
; CHECK: // %bb.0:
; CHECK-NEXT: mov z25.d, z2.d
; CHECK-NEXT: mov z24.d, z1.d
-; CHECK-NEXT: fmop4s za1.h, z0.h, { z24.h, z25.h }
+; CHECK-NEXT: fmop4s za0.h, z0.h, { z24.h, z25.h }
; CHECK-NEXT: ret
- call void @llvm.aarch64.sme.mop4s.1x2.nxv8f16(i32 1, <vscale x 8 x half> %zn, <vscale x 8 x half> %zm1, <vscale x 8 x half> %zm2)
+ call void @llvm.aarch64.sme.mop4s.1x2.nxv8f16(i32 0, <vscale x 8 x half> %zn, <vscale x 8 x half> %zm1, <vscale x 8 x half> %zm2)
ret void
}
@@ -297,9 +297,9 @@ define void @mop4a_za32_f32(<vscale x 4 x float> %zn, <vscale x 4 x float> %zm1,
; CHECK: // %bb.0:
; CHECK-NEXT: mov z25.d, z2.d
; CHECK-NEXT: mov z24.d, z1.d
-; CHECK-NEXT: fmop4a za1.s, z0.s, { z24.s, z25.s }
+; CHECK-NEXT: fmop4a za0.s, z0.s, { z24.s, z25.s }
; CHECK-NEXT: ret
- call void @llvm.aarch64.sme.mop4a.1x2.nxv4f32(i32 1, <vscale x 4 x float> %zn, <vscale x 4 x float> %zm1, <vscale x 4 x float> %zm2)
+ call void @llvm.aarch64.sme.mop4a.1x2.nxv4f32(i32 0, <vscale x 4 x float> %zn, <vscale x 4 x float> %zm1, <vscale x 4 x float> %zm2)
ret void
}
@@ -308,9 +308,9 @@ define void @mop4s_za32_f32(<vscale x 4 x float> %zn, <vscale x 4 x float> %zm1,
; CHECK: // %bb.0:
; CHECK-NEXT: mov z25.d, z2.d
; CHECK-NEXT: mov z24.d, z1.d
-; CHECK-NEXT: fmop4s za1.s, z0.s, { z24.s, z25.s }
+; CHECK-NEXT: fmop4s za0.s, z0.s, { z24.s, z25.s }
; CHECK-NEXT: ret
- call void @llvm.aarch64.sme.mop4s.1x2.nxv4f32(i32 1, <vscale x 4 x float> %zn, <vscale x 4 x float> %zm1, <vscale x 4 x float> %zm2)
+ call void @llvm.aarch64.sme.mop4s.1x2.nxv4f32(i32 0, <vscale x 4 x float> %zn, <vscale x 4 x float> %zm1, <vscale x 4 x float> %zm2)
ret void
}
@@ -319,9 +319,9 @@ define void @mop4a_za64_f64(<vscale x 2 x double> %zn, <vscale x 2 x double> %zm
; CHECK: // %bb.0:
; CHECK-NEXT: mov z25.d, z2.d
; CHECK-NEXT: mov z24.d, z1.d
-; CHECK-NEXT: fmop4a za1.d, z0.d, { z24.d, z25.d }
+; CHECK-NEXT: fmop4a za0.d, z0.d, { z24.d, z25.d }
; CHECK-NEXT: ret
- call void @llvm.aarch64.sme.mop4a.1x2.nxv2f64(i32 1, <vscale x 2 x double> %zn, <vscale x 2 x double> %zm1, <vscale x 2 x double> %zm2)
+ call void @llvm.aarch64.sme.mop4a.1x2.nxv2f64(i32 0, <vscale x 2 x double> %zn, <vscale x 2 x double> %zm1, <vscale x 2 x double> %zm2)
ret void
}
@@ -330,9 +330,9 @@ define void @mop4s_za64_f64(<vscale x 2 x double> %zn, <vscale x 2 x double> %zm
; CHECK: // %bb.0:
; CHECK-NEXT: mov z25.d, z2.d
; CHECK-NEXT: mov z24.d, z1.d
-; CHECK-NEXT: fmop4s za1.d, z0.d, { z24.d, z25.d }
+; CHECK-NEXT: fmop4s za0.d, z0.d, { z24.d, z25.d }
; CHECK-NEXT: ret
- call void @llvm.aarch64.sme.mop4s.1x2.nxv2f64(i32 1, <vscale x 2 x double> %zn, <vscale x 2 x double> %zm1, <vscale x 2 x double> %zm2)
+ call void @llvm.aarch64.sme.mop4s.1x2.nxv2f64(i32 0, <vscale x 2 x double> %zn, <vscale x 2 x double> %zm1, <vscale x 2 x double> %zm2)
ret void
}
@@ -341,9 +341,9 @@ define void @mop4a_za16_bf16(<vscale x 8 x bfloat> %zn, <vscale x 8 x bfloat> %z
; CHECK: // %bb.0:
; CHECK-NEXT: mov z25.d, z2.d
; CHECK-NEXT: mov z24.d, z1.d
-; CHECK-NEXT: bfmop4a za1.h, z0.h, { z24.h, z25.h }
+; CHECK-NEXT: bfmop4a za0.h, z0.h, { z24.h, z25.h }
; CHECK-NEXT: ret
- call void @llvm.aarch64.sme.mop4a.1x2.nxv8bf16(i32 1, <vscale x 8 x bfloat> %zn, <vscale x 8 x bfloat> %zm1, <vscale x 8 x bfloat> %zm2)
+ call void @llvm.aarch64.sme.mop4a.1x2.nxv8bf16(i32 0, <vscale x 8 x bfloat> %zn, <vscale x 8 x bfloat> %zm1, <vscale x 8 x bfloat> %zm2)
ret void
}
@@ -352,6 +352,107 @@ define void @mop4s_za16_bf16(<vscale x 8 x bfloat> %zn, <vscale x 8 x bfloat> %z
; CHECK: // %bb.0:
; CHECK-NEXT: mov z25.d, z2.d
; CHECK-NEXT: mov z24.d, z1.d
+; CHECK-NEXT: bfmop4s za0.h, z0.h, { z24.h, z25.h }
+; CHECK-NEXT: ret
+ call void @llvm.aarch64.sme.mop4s.1x2.nxv8bf16(i32 0, <vscale x 8 x bfloat> %zn, <vscale x 8 x bfloat> %zm1, <vscale x 8 x bfloat> %zm2)
+ ret void
+}
+
+; Tile limits
+
+define void @mop4s_za32_s8_limit(<vscale x 16 x i8> %zn, <vscale x 16 x i8> %zm1, <vscale x 16 x i8> %zm2) #0 {
+; CHECK-LABEL: mop4s_za32_s8_limit:
+; CHECK: // %bb.0:
+; CHECK-NEXT: mov z25.d, z2.d
+; CHECK-NEXT: mov z24.d, z1.d
+; CHECK-NEXT: smop4s za3.s, z0.b, { z24.b, z25.b }
+; CHECK-NEXT: ret
+ call void @llvm.aarch64.sme.smop4s.wide.1x2.nxv16i8(i32 3, <vscale x 16 x i8> %zn, <vscale x 16 x i8> %zm1, <vscale x 16 x i8> %zm2)
+ ret void
+}
+
+define void @mop4s_za32_s16_limit(<vscale x 8 x i16> %zn, <vscale x 8 x i16> %zm1, <vscale x 8 x i16> %zm2) #0 {
+; CHECK-LABEL: mop4s_za32_s16_limit:
+; CHECK: // %bb.0:
+; CHECK-NEXT: mov z25.d, z2.d
+; CHECK-NEXT: mov z24.d, z1.d
+; CHECK-NEXT: smop4s za3.s, z0.h, { z24.h, z25.h }
+; CHECK-NEXT: ret
+ call void @llvm.aarch64.sme.smop4s.wide.1x2.nxv8i16(i32 3, <vscale x 8 x i16> %zn, <vscale x 8 x i16> %zm1, <vscale x 8 x i16> %zm2)
+ ret void
+}
+
+define void @mop4s_za32_f16_limit(<vscale x 8 x half> %zn, <vscale x 8 x half> %zm1, <vscale x 8 x half> %zm2) #0 {
+; CHECK-LABEL: mop4s_za32_f16_limit:
+; CHECK: // %bb.0:
+; CHECK-NEXT: mov z25.d, z2.d
+; CHECK-NEXT: mov z24.d, z1.d
+; CHECK-NEXT: fmop4s za3.s, z0.h, { z24.h, z25.h }
+; CHECK-NEXT: ret
+ call void @llvm.aarch64.sme.mop4s.wide.1x2.nxv8f16(i32 3, <vscale x 8 x half> %zn, <vscale x 8 x half> %zm1, <vscale x 8 x half> %zm2)
+ ret void
+}
+
+define void @mop4s_za32_bf16_limit(<vscale x 8 x bfloat> %zn, <vscale x 8 x bfloat> %zm1, <vscale x 8 x bfloat> %zm2) #0 {
+; CHECK-LABEL: mop4s_za32_bf16_limit:
+; CHECK: // %bb.0:
+; CHECK-NEXT: mov z25.d, z2.d
+; CHECK-NEXT: mov z24.d, z1.d
+; CHECK-NEXT: bfmop4s za3.s, z0.h, { z24.h, z25.h }
+; CHECK-NEXT: ret
+ call void @llvm.aarch64.sme.mop4s.wide.1x2.nxv8bf16(i32 3, <vscale x 8 x bfloat> %zn, <vscale x 8 x bfloat> %zm1, <vscale x 8 x bfloat> %zm2)
+ ret void
+}
+
+define void @mop4s_za64_s16_limit(<vscale x 8 x i16> %zn, <vscale x 8 x i16> %zm1, <vscale x 8 x i16> %zm2) #0 {
+; CHECK-LABEL: mop4s_za64_s16_limit:
+; CHECK: // %bb.0:
+; CHECK-NEXT: mov z25.d, z2.d
+; CHECK-NEXT: mov z24.d, z1.d
+; CHECK-NEXT: smop4s za7.d, z0.h, { z24.h, z25.h }
+; CHECK-NEXT: ret
+ call void @llvm.aarch64.sme.smop4s.za64.wide.1x2.nxv8i16(i32 7, <vscale x 8 x i16> %zn, <vscale x 8 x i16> %zm1, <vscale x 8 x i16> %zm2)
+ ret void
+}
+
+define void @mop4s_za64_f64_limit(<vscale x 2 x double> %zn, <vscale x 2 x double> %zm1, <vscale x 2 x double> %zm2) #0 {
+; CHECK-LABEL: mop4s_za64_f64_limit:
+; CHECK: // %bb.0:
+; CHECK-NEXT: mov z25.d, z2.d
+; CHECK-NEXT: mov z24.d, z1.d
+; CHECK-NEXT: fmop4s za7.d, z0.d, { z24.d, z25.d }
+; CHECK-NEXT: ret
+ call void @llvm.aarch64.sme.mop4s.1x2.nxv2f64(i32 7, <vscale x 2 x double> %zn, <vscale x 2 x double> %zm1, <vscale x 2 x double> %zm2)
+ ret void
+}
+
+define void @mop4s_za32_f32_limit(<vscale x 4 x float> %zn, <vscale x 4 x float> %zm1, <vscale x 4 x float> %zm2) #0 {
+; CHECK-LABEL: mop4s_za32_f32_limit:
+; CHECK: // %bb.0:
+; CHECK-NEXT: mov z25.d, z2.d
+; CHECK-NEXT: mov z24.d, z1.d
+; CHECK-NEXT: fmop4s za3.s, z0.s, { z24.s, z25.s }
+; CHECK-NEXT: ret
+ call void @llvm.aarch64.sme.mop4s.1x2.nxv4f32(i32 3, <vscale x 4 x float> %zn, <vscale x 4 x float> %zm1, <vscale x 4 x float> %zm2)
+ ret void
+}
+
+define void @mop4s_za16_f16_limit(<vscale x 8 x half> %zn, <vscale x 8 x half> %zm1, <vscale x 8 x half> %zm2) #0 {
+; CHECK-LABEL: mop4s_za16_f16_limit:
+; CHECK: // %bb.0:
+; CHECK-NEXT: mov z25.d, z2.d
+; CHECK-NEXT: mov z24.d, z1.d
+; CHECK-NEXT: fmop4s za1.h, z0.h, { z24.h, z25.h }
+; CHECK-NEXT: ret
+ call void @llvm.aarch64.sme.mop4s.1x2.nxv8f16(i32 1, <vscale x 8 x half> %zn, <vscale x 8 x half> %zm1, <vscale x 8 x half> %zm2)
+ ret void
+}
+
+define void @mop4s_za16_bf16_limit(<vscale x 8 x bfloat> %zn, <vscale x 8 x bfloat> %zm1, <vscale x 8 x bfloat> %zm2) #0 {
+; CHECK-LABEL: mop4s_za16_bf16_limit:
+; CHECK: // %bb.0:
+; CHECK-NEXT: mov z25.d, z2.d
+; CHECK-NEXT: mov z24.d, z1.d
; CHECK-NEXT: bfmop4s za1.h, z0.h, { z24.h, z25.h }
; CHECK-NEXT: ret
call void @llvm.aarch64.sme.mop4s.1x2.nxv8bf16(i32 1, <vscale x 8 x bfloat> %zn, <vscale x 8 x bfloat> %zm1, <vscale x 8 x bfloat> %zm2)
From bdadfbbe14ed3967d9654abb127b107056a94788 Mon Sep 17 00:00:00 2001
From: Jonathan Thackray <jonathan.thackray at arm.com>
Date: Wed, 26 Feb 2025 12:01:22 +0000
Subject: [PATCH 11/11] [Clang][LLVM] Implement multi-single vectors MOP4{A/S}
Implement all multi-single {BF/F/S/U/SU/US}MOP4{A/S} instructions in Clang and
LLVM, following the ACLE proposal in https://github.com/ARM-software/acle/pull/381/files.
This PR depends on #128854
---
clang/include/clang/Basic/arm_sme.td | 9 +
.../sme2-intrinsics/acle_sme2_mop4_2x1.c | 304 ++++++++++++++
llvm/include/llvm/IR/IntrinsicsAArch64.td | 31 +-
llvm/lib/Target/AArch64/SMEInstrFormats.td | 84 +++-
.../AArch64/sme2-intrinsics-mop4a_2x1.ll | 393 ++++++++++++++++++
5 files changed, 810 insertions(+), 11 deletions(-)
create mode 100644 clang/test/CodeGen/AArch64/sme2-intrinsics/acle_sme2_mop4_2x1.c
create mode 100644 llvm/test/CodeGen/AArch64/sme2-intrinsics-mop4a_2x1.ll
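
For readers skimming the diff: a minimal usage sketch of the new multi-single
("2x1") form, in which the first operand is a two-vector tuple and the second a
single vector. The intrinsic names are taken from the ACLE tests below; tile
index 0 is an arbitrary choice within the 0-3 range allowed for za32.

#include <arm_sme.h>

// Sketch only: signed 8-bit quarter-tile outer products into 32-bit
// ZA tile 0, using the new 2x1 (multi-single) form.
void mop4_2x1_sketch(svint8x2_t zn, svint8_t zm) __arm_streaming __arm_inout("za") {
  svmop4a_2x1_za32_s8_s8(0, zn, zm); // accumulate
  svmop4s_2x1_za32_s8_s8(0, zn, zm); // subtract
}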
diff --git a/clang/include/clang/Basic/arm_sme.td b/clang/include/clang/Basic/arm_sme.td
index b8fbfae3060d3..e0e8e022418b0 100644
--- a/clang/include/clang/Basic/arm_sme.td
+++ b/clang/include/clang/Basic/arm_sme.td
@@ -379,6 +379,7 @@ let SMETargetGuard = "sme2" in {
multiclass MOP4<string name, string n, string t, string i, string wide, list<ImmCheck> checks> {
def NAME # "_1x1" : Inst<"svmop4" # name # "_1x1_" # n # "[_{d}_{d}]", "vidd", t, MergeNone, i # wide # "_1x1", [IsInOutZA, IsStreaming], checks>;
def NAME # "_1x2" : Inst<"svmop4" # name # "_1x2_" # n # "[_{d}_{d}]", "vid2", t, MergeNone, i # wide # "_1x2", [IsInOutZA, IsStreaming], checks>;
+ def NAME # "_2x1" : Inst<"svmop4" # name # "_2x1_" # n # "[_{d}_{d}]", "vi2d", t, MergeNone, i # wide # "_2x1", [IsInOutZA, IsStreaming], checks>;
}
multiclass SUMOP4<string s, string za, string t, string i, list<ImmCheck> checks> {
@@ -390,6 +391,10 @@ multiclass SUMOP4<string s, string za, string t, string i, list<ImmCheck> checks
"vid2.u", t, MergeNone, "aarch64_sme_sumop4" # s # i # "_wide_1x2",
[IsStreaming, IsInOutZA],
checks>;
+ def _2x1 : SInst<"svmop4" # s # "[_2x1_]" # za # "[_{2}_{3}]",
+ "vi2u", t, MergeNone, "aarch64_sme_sumop4" # s # i # "_wide_2x1",
+ [IsStreaming, IsInOutZA],
+ checks>;
}
multiclass USMOP4<string s, string za, string t, string i, list<ImmCheck> checks> {
@@ -401,6 +406,10 @@ multiclass USMOP4<string s, string za, string t, string i, list<ImmCheck> checks
"vid2.x", t, MergeNone, "aarch64_sme_usmop4" # s # i # "_wide_1x2",
[IsStreaming, IsInOutZA],
checks>;
+ def _2x1 : SInst<"svmop4" # s # "[_2x1_]" # za # "[_{2}_{3}]",
+ "vi2x", t, MergeNone, "aarch64_sme_usmop4" # s # i # "_wide_2x1",
+ [IsStreaming, IsInOutZA],
+ checks>;
}
let SMETargetGuard = "sme2" in {
diff --git a/clang/test/CodeGen/AArch64/sme2-intrinsics/acle_sme2_mop4_2x1.c b/clang/test/CodeGen/AArch64/sme2-intrinsics/acle_sme2_mop4_2x1.c
new file mode 100644
index 0000000000000..3ec51cd9a0bd4
--- /dev/null
+++ b/clang/test/CodeGen/AArch64/sme2-intrinsics/acle_sme2_mop4_2x1.c
@@ -0,0 +1,304 @@
+// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py
+
+// REQUIRES: aarch64-registered-target
+// RUN: %clang_cc1 -triple aarch64 -target-feature +bf16 -target-feature +sme-mop4 -target-feature +sme-f16f16 -target-feature +sme-i16i64 -target-feature +sme-b16b16 -target-feature +sme-f64f64 -target-feature +sme -target-feature +sme2 -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s
+// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64 -target-feature +bf16 -target-feature +sme-mop4 -target-feature +sme-f16f16 -target-feature +sme-i16i64 -target-feature +sme-b16b16 -target-feature +sme-f64f64 -target-feature +sme -target-feature +sme2 -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s
+// RUN: %clang_cc1 -triple aarch64 -target-feature +bf16 -target-feature +sme-mop4 -target-feature +sme-f16f16 -target-feature +sme-i16i64 -target-feature +sme-b16b16 -target-feature +sme-f64f64 -target-feature +sme -target-feature +sme2 -S -disable-O0-optnone -Werror -Wall -o /dev/null %s
+
+
+#include <arm_sme.h>
+
+#ifdef SVE_OVERLOADED_FORMS
+#define SME_ACLE_FUNC(A1,A2_UNUSED,A3) A1##A3
+#else
+#define SME_ACLE_FUNC(A1,A2,A3) A1##A2##A3
+#endif
+
+// CHECK-LABEL: @test_svmop4a_2x1_za32_s8_s8(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: tail call void @llvm.aarch64.sme.smop4a.wide.2x1.nxv16i8(i32 1, <vscale x 16 x i8> [[ZN_COERCE0:%.*]], <vscale x 16 x i8> [[ZN_COERCE1:%.*]], <vscale x 16 x i8> [[ZM:%.*]])
+// CHECK-NEXT: ret void
+//
+void test_svmop4a_2x1_za32_s8_s8(svint8x2_t zn, svint8_t zm) __arm_streaming __arm_inout("za") {
+ SME_ACLE_FUNC(svmop4a_2x1_za32,_s8_s8,)(1, zn, zm);
+}
+
+// CHECK-LABEL: @test_svmop4s_2x1_za32_s8_s8(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: tail call void @llvm.aarch64.sme.smop4s.wide.2x1.nxv16i8(i32 1, <vscale x 16 x i8> [[ZN_COERCE0:%.*]], <vscale x 16 x i8> [[ZN_COERCE1:%.*]], <vscale x 16 x i8> [[ZM:%.*]])
+// CHECK-NEXT: ret void
+//
+void test_svmop4s_2x1_za32_s8_s8(svint8x2_t zn, svint8_t zm) __arm_streaming __arm_inout("za") {
+ SME_ACLE_FUNC(svmop4s_2x1_za32,_s8_s8,)(1, zn, zm);
+}
+
+// CHECK-LABEL: @test_svmop4a_2x1_za32_u8_u8(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: tail call void @llvm.aarch64.sme.umop4a.wide.2x1.nxv16i8(i32 1, <vscale x 16 x i8> [[ZN_COERCE0:%.*]], <vscale x 16 x i8> [[ZN_COERCE1:%.*]], <vscale x 16 x i8> [[ZM:%.*]])
+// CHECK-NEXT: ret void
+//
+void test_svmop4a_2x1_za32_u8_u8(svuint8x2_t zn, svuint8_t zm) __arm_streaming __arm_inout("za") {
+ SME_ACLE_FUNC(svmop4a_2x1_za32,_u8_u8,)(1, zn, zm);
+}
+
+// CHECK-LABEL: @test_svmop4s_2x1_za32_u8_u8(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: tail call void @llvm.aarch64.sme.umop4s.wide.2x1.nxv16i8(i32 1, <vscale x 16 x i8> [[ZN_COERCE0:%.*]], <vscale x 16 x i8> [[ZN_COERCE1:%.*]], <vscale x 16 x i8> [[ZM:%.*]])
+// CHECK-NEXT: ret void
+//
+void test_svmop4s_2x1_za32_u8_u8(svuint8x2_t zn, svuint8_t zm) __arm_streaming __arm_inout("za") {
+ SME_ACLE_FUNC(svmop4s_2x1_za32,_u8_u8,)(1, zn, zm);
+}
+
+// CHECK-LABEL: @test_svmop4a_2x1_za32_s8_u8(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: tail call void @llvm.aarch64.sme.sumop4a.wide.2x1.nxv16i8(i32 1, <vscale x 16 x i8> [[ZN_COERCE0:%.*]], <vscale x 16 x i8> [[ZN_COERCE1:%.*]], <vscale x 16 x i8> [[ZM:%.*]])
+// CHECK-NEXT: ret void
+//
+void test_svmop4a_2x1_za32_s8_u8(svint8x2_t zn, svuint8_t zm) __arm_streaming __arm_inout("za") {
+ SME_ACLE_FUNC(svmop4a_2x1_za32,_s8_u8,)(1, zn, zm);
+}
+
+// CHECK-LABEL: @test_svmop4s_2x1_za32_s8_u8(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: tail call void @llvm.aarch64.sme.sumop4s.wide.2x1.nxv16i8(i32 1, <vscale x 16 x i8> [[ZN_COERCE0:%.*]], <vscale x 16 x i8> [[ZN_COERCE1:%.*]], <vscale x 16 x i8> [[ZM:%.*]])
+// CHECK-NEXT: ret void
+//
+void test_svmop4s_2x1_za32_s8_u8(svint8x2_t zn, svuint8_t zm) __arm_streaming __arm_inout("za") {
+ SME_ACLE_FUNC(svmop4s_2x1_za32,_s8_u8,)(1, zn, zm);
+}
+
+// CHECK-LABEL: @test_svmop4a_2x1_za32_u8_s8(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: tail call void @llvm.aarch64.sme.usmop4a.wide.2x1.nxv16i8(i32 1, <vscale x 16 x i8> [[ZN_COERCE0:%.*]], <vscale x 16 x i8> [[ZN_COERCE1:%.*]], <vscale x 16 x i8> [[ZM:%.*]])
+// CHECK-NEXT: ret void
+//
+void test_svmop4a_2x1_za32_u8_s8(svuint8x2_t zn, svint8_t zm) __arm_streaming __arm_inout("za") {
+ SME_ACLE_FUNC(svmop4a_2x1_za32,_u8_s8,)(1, zn, zm);
+}
+
+// CHECK-LABEL: @test_svmop4s_2x1_za32_u8_s8(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: tail call void @llvm.aarch64.sme.usmop4s.wide.2x1.nxv16i8(i32 1, <vscale x 16 x i8> [[ZN_COERCE0:%.*]], <vscale x 16 x i8> [[ZN_COERCE1:%.*]], <vscale x 16 x i8> [[ZM:%.*]])
+// CHECK-NEXT: ret void
+//
+void test_svmop4s_2x1_za32_u8_s8(svuint8x2_t zn, svint8_t zm) __arm_streaming __arm_inout("za") {
+ SME_ACLE_FUNC(svmop4s_2x1_za32,_u8_s8,)(1, zn, zm);
+}
+
+// CHECK-LABEL: @test_svmop4a_2x1_za32_s16_s16(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: tail call void @llvm.aarch64.sme.smop4a.wide.2x1.nxv8i16(i32 1, <vscale x 8 x i16> [[ZN_COERCE0:%.*]], <vscale x 8 x i16> [[ZN_COERCE1:%.*]], <vscale x 8 x i16> [[ZM:%.*]])
+// CHECK-NEXT: ret void
+//
+void test_svmop4a_2x1_za32_s16_s16(svint16x2_t zn, svint16_t zm) __arm_streaming __arm_inout("za") {
+ SME_ACLE_FUNC(svmop4a_2x1_za32,_s16_s16,)(1, zn, zm);
+}
+
+// CHECK-LABEL: @test_svmop4s_2x1_za32_s16_s16(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: tail call void @llvm.aarch64.sme.smop4s.wide.2x1.nxv8i16(i32 1, <vscale x 8 x i16> [[ZN_COERCE0:%.*]], <vscale x 8 x i16> [[ZN_COERCE1:%.*]], <vscale x 8 x i16> [[ZM:%.*]])
+// CHECK-NEXT: ret void
+//
+void test_svmop4s_2x1_za32_s16_s16(svint16x2_t zn, svint16_t zm) __arm_streaming __arm_inout("za") {
+ SME_ACLE_FUNC(svmop4s_2x1_za32,_s16_s16,)(1, zn, zm);
+}
+
+// CHECK-LABEL: @test_svmop4a_2x1_za32_u16_u16(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: tail call void @llvm.aarch64.sme.umop4a.wide.2x1.nxv8i16(i32 1, <vscale x 8 x i16> [[ZN_COERCE0:%.*]], <vscale x 8 x i16> [[ZN_COERCE1:%.*]], <vscale x 8 x i16> [[ZM:%.*]])
+// CHECK-NEXT: ret void
+//
+void test_svmop4a_2x1_za32_u16_u16(svuint16x2_t zn, svuint16_t zm) __arm_streaming __arm_inout("za") {
+ SME_ACLE_FUNC(svmop4a_2x1_za32,_u16_u16,)(1, zn, zm);
+}
+
+// CHECK-LABEL: @test_svmop4s_2x1_za32_u16_u16(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: tail call void @llvm.aarch64.sme.umop4s.wide.2x1.nxv8i16(i32 1, <vscale x 8 x i16> [[ZN_COERCE0:%.*]], <vscale x 8 x i16> [[ZN_COERCE1:%.*]], <vscale x 8 x i16> [[ZM:%.*]])
+// CHECK-NEXT: ret void
+//
+void test_svmop4s_2x1_za32_u16_u16(svuint16x2_t zn, svuint16_t zm) __arm_streaming __arm_inout("za") {
+ SME_ACLE_FUNC(svmop4s_2x1_za32,_u16_u16,)(1, zn, zm);
+}
+
+// CHECK-LABEL: @test_svmop4a_2x1_za32_f16_f16(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: tail call void @llvm.aarch64.sme.mop4a.wide.2x1.nxv8f16(i32 1, <vscale x 8 x half> [[ZN_COERCE0:%.*]], <vscale x 8 x half> [[ZN_COERCE1:%.*]], <vscale x 8 x half> [[ZM:%.*]])
+// CHECK-NEXT: ret void
+//
+void test_svmop4a_2x1_za32_f16_f16(svfloat16x2_t zn, svfloat16_t zm) __arm_streaming __arm_inout("za") {
+ SME_ACLE_FUNC(svmop4a_2x1_za32,_f16_f16,)(1, zn, zm);
+}
+
+// CHECK-LABEL: @test_svmop4s_2x1_za32_f16_f16(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: tail call void @llvm.aarch64.sme.mop4s.wide.2x1.nxv8f16(i32 1, <vscale x 8 x half> [[ZN_COERCE0:%.*]], <vscale x 8 x half> [[ZN_COERCE1:%.*]], <vscale x 8 x half> [[ZM:%.*]])
+// CHECK-NEXT: ret void
+//
+void test_svmop4s_2x1_za32_f16_f16(svfloat16x2_t zn, svfloat16_t zm) __arm_streaming __arm_inout("za") {
+ SME_ACLE_FUNC(svmop4s_2x1_za32,_f16_f16,)(1, zn, zm);
+}
+
+// CHECK-LABEL: @test_svmop4a_2x1_za32_bf16_bf16(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: tail call void @llvm.aarch64.sme.mop4a.wide.2x1.nxv8bf16(i32 1, <vscale x 8 x bfloat> [[ZN_COERCE0:%.*]], <vscale x 8 x bfloat> [[ZN_COERCE1:%.*]], <vscale x 8 x bfloat> [[ZM:%.*]])
+// CHECK-NEXT: ret void
+//
+void test_svmop4a_2x1_za32_bf16_bf16(svbfloat16x2_t zn, svbfloat16_t zm) __arm_streaming __arm_inout("za") {
+ SME_ACLE_FUNC(svmop4a_2x1_za32,_bf16_bf16,)(1, zn, zm);
+}
+
+// CHECK-LABEL: @test_svmop4s_2x1_za32_bf16_bf16(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: tail call void @llvm.aarch64.sme.mop4s.wide.2x1.nxv8bf16(i32 1, <vscale x 8 x bfloat> [[ZN_COERCE0:%.*]], <vscale x 8 x bfloat> [[ZN_COERCE1:%.*]], <vscale x 8 x bfloat> [[ZM:%.*]])
+// CHECK-NEXT: ret void
+//
+void test_svmop4s_2x1_za32_bf16_bf16(svbfloat16x2_t zn, svbfloat16_t zm) __arm_streaming __arm_inout("za") {
+ SME_ACLE_FUNC(svmop4s_2x1_za32,_bf16_bf16,)(1, zn, zm);
+}
+
+// CHECK-LABEL: @test_svmop4a_2x1_za64_s16_s16(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: tail call void @llvm.aarch64.sme.smop4a.za64.wide.2x1.nxv8i16(i32 1, <vscale x 8 x i16> [[ZN_COERCE0:%.*]], <vscale x 8 x i16> [[ZN_COERCE1:%.*]], <vscale x 8 x i16> [[ZM:%.*]])
+// CHECK-NEXT: ret void
+//
+void test_svmop4a_2x1_za64_s16_s16(svint16x2_t zn, svint16_t zm) __arm_streaming __arm_inout("za") {
+ SME_ACLE_FUNC(svmop4a_2x1_za64,_s16_s16,)(1, zn, zm);
+}
+
+// CHECK-LABEL: @test_svmop4s_2x1_za64_s16_s16(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: tail call void @llvm.aarch64.sme.smop4s.za64.wide.2x1.nxv8i16(i32 1, <vscale x 8 x i16> [[ZN_COERCE0:%.*]], <vscale x 8 x i16> [[ZN_COERCE1:%.*]], <vscale x 8 x i16> [[ZM:%.*]])
+// CHECK-NEXT: ret void
+//
+void test_svmop4s_2x1_za64_s16_s16(svint16x2_t zn, svint16_t zm) __arm_streaming __arm_inout("za") {
+ SME_ACLE_FUNC(svmop4s_2x1_za64,_s16_s16,)(1, zn, zm);
+}
+
+// CHECK-LABEL: @test_svmop4a_2x1_za64_u16_u16(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: tail call void @llvm.aarch64.sme.umop4a.za64.wide.2x1.nxv8i16(i32 1, <vscale x 8 x i16> [[ZN_COERCE0:%.*]], <vscale x 8 x i16> [[ZN_COERCE1:%.*]], <vscale x 8 x i16> [[ZM:%.*]])
+// CHECK-NEXT: ret void
+//
+void test_svmop4a_2x1_za64_u16_u16(svuint16x2_t zn, svuint16_t zm) __arm_streaming __arm_inout("za") {
+ SME_ACLE_FUNC(svmop4a_2x1_za64,_u16_u16,)(1, zn, zm);
+}
+
+// CHECK-LABEL: @test_svmop4s_2x1_za64_u16_u16(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: tail call void @llvm.aarch64.sme.umop4s.za64.wide.2x1.nxv8i16(i32 1, <vscale x 8 x i16> [[ZN_COERCE0:%.*]], <vscale x 8 x i16> [[ZN_COERCE1:%.*]], <vscale x 8 x i16> [[ZM:%.*]])
+// CHECK-NEXT: ret void
+//
+void test_svmop4s_2x1_za64_u16_u16(svuint16x2_t zn, svuint16_t zm) __arm_streaming __arm_inout("za") {
+ SME_ACLE_FUNC(svmop4s_2x1_za64,_u16_u16,)(1, zn, zm);
+}
+
+// CHECK-LABEL: @test_svmop4a_2x1_za64_s16_u16(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: tail call void @llvm.aarch64.sme.sumop4a.za64.wide.2x1.nxv8i16(i32 1, <vscale x 8 x i16> [[ZN_COERCE0:%.*]], <vscale x 8 x i16> [[ZN_COERCE1:%.*]], <vscale x 8 x i16> [[ZM:%.*]])
+// CHECK-NEXT: ret void
+//
+void test_svmop4a_2x1_za64_s16_u16(svint16x2_t zn, svuint16_t zm) __arm_streaming __arm_inout("za") {
+ SME_ACLE_FUNC(svmop4a_2x1_za64,_s16_u16,)(1, zn, zm);
+}
+
+// CHECK-LABEL: @test_svmop4s_2x1_za64_s16_u16(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: tail call void @llvm.aarch64.sme.sumop4s.za64.wide.2x1.nxv8i16(i32 1, <vscale x 8 x i16> [[ZN_COERCE0:%.*]], <vscale x 8 x i16> [[ZN_COERCE1:%.*]], <vscale x 8 x i16> [[ZM:%.*]])
+// CHECK-NEXT: ret void
+//
+void test_svmop4s_2x1_za64_s16_u16(svint16x2_t zn, svuint16_t zm) __arm_streaming __arm_inout("za") {
+ SME_ACLE_FUNC(svmop4s_2x1_za64,_s16_u16,)(1, zn, zm);
+}
+
+// CHECK-LABEL: @test_svmop4a_2x1_za64_u16_s16(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: tail call void @llvm.aarch64.sme.usmop4a.za64.wide.2x1.nxv8i16(i32 1, <vscale x 8 x i16> [[ZN_COERCE0:%.*]], <vscale x 8 x i16> [[ZN_COERCE1:%.*]], <vscale x 8 x i16> [[ZM:%.*]])
+// CHECK-NEXT: ret void
+//
+void test_svmop4a_2x1_za64_u16_s16(svuint16x2_t zn, svint16_t zm) __arm_streaming __arm_inout("za") {
+ SME_ACLE_FUNC(svmop4a_2x1_za64,_u16_s16,)(1, zn, zm);
+}
+
+// CHECK-LABEL: @test_svmop4s_2x1_za64_u16_s16(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: tail call void @llvm.aarch64.sme.usmop4s.za64.wide.2x1.nxv8i16(i32 1, <vscale x 8 x i16> [[ZN_COERCE0:%.*]], <vscale x 8 x i16> [[ZN_COERCE1:%.*]], <vscale x 8 x i16> [[ZM:%.*]])
+// CHECK-NEXT: ret void
+//
+void test_svmop4s_2x1_za64_u16_s16(svuint16x2_t zn, svint16_t zm) __arm_streaming __arm_inout("za") {
+ SME_ACLE_FUNC(svmop4s_2x1_za64,_u16_s16,)(1, zn, zm);
+}
+
+
+// CHECK-LABEL: @test_svmop4a_2x1_za16_f16_f16(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: tail call void @llvm.aarch64.sme.mop4a.2x1.nxv8f16(i32 1, <vscale x 8 x half> [[ZN_COERCE0:%.*]], <vscale x 8 x half> [[ZN_COERCE1:%.*]], <vscale x 8 x half> [[ZM:%.*]])
+// CHECK-NEXT: ret void
+//
+void test_svmop4a_2x1_za16_f16_f16(svfloat16x2_t zn, svfloat16_t zm) __arm_streaming __arm_inout("za") {
+ SME_ACLE_FUNC(svmop4a_2x1_za16,_f16_f16,)(1, zn, zm);
+}
+
+// CHECK-LABEL: @test_svmop4s_2x1_za16_f16_f16(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: tail call void @llvm.aarch64.sme.mop4s.2x1.nxv8f16(i32 1, <vscale x 8 x half> [[ZN_COERCE0:%.*]], <vscale x 8 x half> [[ZN_COERCE1:%.*]], <vscale x 8 x half> [[ZM:%.*]])
+// CHECK-NEXT: ret void
+//
+void test_svmop4s_2x1_za16_f16_f16(svfloat16x2_t zn, svfloat16_t zm) __arm_streaming __arm_inout("za") {
+ SME_ACLE_FUNC(svmop4s_2x1_za16,_f16_f16,)(1, zn, zm);
+}
+
+// CHECK-LABEL: @test_svmop4a_2x1_za32_f32_f32(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: tail call void @llvm.aarch64.sme.mop4a.2x1.nxv4f32(i32 1, <vscale x 4 x float> [[ZN_COERCE0:%.*]], <vscale x 4 x float> [[ZN_COERCE1:%.*]], <vscale x 4 x float> [[ZM:%.*]])
+// CHECK-NEXT: ret void
+//
+void test_svmop4a_2x1_za32_f32_f32(svfloat32x2_t zn, svfloat32_t zm) __arm_streaming __arm_inout("za") {
+ SME_ACLE_FUNC(svmop4a_2x1_za32,_f32_f32,)(1, zn, zm);
+}
+
+// CHECK-LABEL: @test_svmop4s_2x1_za32_f32_f32(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: tail call void @llvm.aarch64.sme.mop4s.2x1.nxv4f32(i32 1, <vscale x 4 x float> [[ZN_COERCE0:%.*]], <vscale x 4 x float> [[ZN_COERCE1:%.*]], <vscale x 4 x float> [[ZM:%.*]])
+// CHECK-NEXT: ret void
+//
+void test_svmop4s_2x1_za32_f32_f32(svfloat32x2_t zn, svfloat32_t zm) __arm_streaming __arm_inout("za") {
+ SME_ACLE_FUNC(svmop4s_2x1_za32,_f32_f32,)(1, zn, zm);
+}
+
+// CHECK-LABEL: @test_svmop4a_2x1_za64_f64_f64(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: tail call void @llvm.aarch64.sme.mop4a.2x1.nxv2f64(i32 1, <vscale x 2 x double> [[ZN_COERCE0:%.*]], <vscale x 2 x double> [[ZN_COERCE1:%.*]], <vscale x 2 x double> [[ZM:%.*]])
+// CHECK-NEXT: ret void
+//
+void test_svmop4a_2x1_za64_f64_f64(svfloat64x2_t zn, svfloat64_t zm) __arm_streaming __arm_inout("za") {
+ SME_ACLE_FUNC(svmop4a_2x1_za64,_f64_f64,)(1, zn, zm);
+}
+
+// CHECK-LABEL: @test_svmop4s_2x1_za64_f64_f64(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: tail call void @llvm.aarch64.sme.mop4s.2x1.nxv2f64(i32 1, <vscale x 2 x double> [[ZN_COERCE0:%.*]], <vscale x 2 x double> [[ZN_COERCE1:%.*]], <vscale x 2 x double> [[ZM:%.*]])
+// CHECK-NEXT: ret void
+//
+void test_svmop4s_2x1_za64_f64_f64(svfloat64x2_t zn, svfloat64_t zm) __arm_streaming __arm_inout("za") {
+ SME_ACLE_FUNC(svmop4s_2x1_za64,_f64_f64,)(1, zn, zm);
+}
+
+// CHECK-LABEL: @test_svmop4a_2x1_za16_bf16_bf16(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: tail call void @llvm.aarch64.sme.mop4a.2x1.nxv8bf16(i32 1, <vscale x 8 x bfloat> [[ZN_COERCE0:%.*]], <vscale x 8 x bfloat> [[ZN_COERCE1:%.*]], <vscale x 8 x bfloat> [[ZM:%.*]])
+// CHECK-NEXT: ret void
+//
+void test_svmop4a_2x1_za16_bf16_bf16(svbfloat16x2_t zn, svbfloat16_t zm) __arm_streaming __arm_inout("za") {
+ SME_ACLE_FUNC(svmop4a_2x1_za16,_bf16_bf16,)(1, zn, zm);
+}
+
+// CHECK-LABEL: @test_svmop4s_2x1_za16_bf16_bf16(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: tail call void @llvm.aarch64.sme.mop4s.2x1.nxv8bf16(i32 1, <vscale x 8 x bfloat> [[ZN_COERCE0:%.*]], <vscale x 8 x bfloat> [[ZN_COERCE1:%.*]], <vscale x 8 x bfloat> [[ZM:%.*]])
+// CHECK-NEXT: ret void
+//
+void test_svmop4s_2x1_za16_bf16_bf16(svbfloat16x2_t zn, svbfloat16_t zm) __arm_streaming __arm_inout("za") {
+ SME_ACLE_FUNC(svmop4s_2x1_za16,_bf16_bf16,)(1, zn, zm);
+}
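
A note on the SME_ACLE_FUNC macro in the test above: each RUN line exercises
one of two spellings of the same intrinsic. A short sketch of what the macro
expands to in each mode (assuming, per the ACLE proposal, that the suffix-less
overload resolves on the argument types):

#include <arm_sme.h>

// Explicit form: element types spelled out in the intrinsic name.
void explicit_form(svint8x2_t zn, svint8_t zm) __arm_streaming __arm_inout("za") {
  svmop4a_2x1_za32_s8_s8(0, zn, zm);
}

// Overloaded form (built with -DSVE_OVERLOADED_FORMS): the _s8_s8
// suffix is dropped and the overload is resolved from zn and zm.
void overloaded_form(svint8x2_t zn, svint8_t zm) __arm_streaming __arm_inout("za") {
  svmop4a_2x1_za32(0, zn, zm);
}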
diff --git a/llvm/include/llvm/IR/IntrinsicsAArch64.td b/llvm/include/llvm/IR/IntrinsicsAArch64.td
index 222c3d63cd68c..cd88445f7455d 100644
--- a/llvm/include/llvm/IR/IntrinsicsAArch64.td
+++ b/llvm/include/llvm/IR/IntrinsicsAArch64.td
@@ -3133,7 +3133,36 @@ let TargetPrefix = "aarch64" in {
def int_aarch64_sme_sumop4s_za64_wide_1x2 : SME_OuterProduct_QuaterTile_Multi;
def int_aarch64_sme_usmop4a_za64_wide_1x2 : SME_OuterProduct_QuaterTile_Multi;
def int_aarch64_sme_usmop4s_za64_wide_1x2 : SME_OuterProduct_QuaterTile_Multi;
-
+
+ def int_aarch64_sme_mop4a_wide_2x1 : SME_OuterProduct_QuaterTile_Multi;
+ def int_aarch64_sme_mop4s_wide_2x1 : SME_OuterProduct_QuaterTile_Multi;
+ def int_aarch64_sme_mop4a_2x1 : SME_OuterProduct_QuaterTile_Multi;
+ def int_aarch64_sme_mop4s_2x1 : SME_OuterProduct_QuaterTile_Multi;
+ def int_aarch64_sme_smop4a_wide_2x1 : SME_OuterProduct_QuaterTile_Multi;
+ def int_aarch64_sme_smop4s_wide_2x1 : SME_OuterProduct_QuaterTile_Multi;
+ def int_aarch64_sme_smop4a_2x1 : SME_OuterProduct_QuaterTile_Multi;
+ def int_aarch64_sme_smop4s_2x1 : SME_OuterProduct_QuaterTile_Multi;
+ def int_aarch64_sme_umop4a_wide_2x1 : SME_OuterProduct_QuaterTile_Multi;
+ def int_aarch64_sme_umop4s_wide_2x1 : SME_OuterProduct_QuaterTile_Multi;
+ def int_aarch64_sme_umop4a_2x1 : SME_OuterProduct_QuaterTile_Multi;
+ def int_aarch64_sme_umop4s_2x1 : SME_OuterProduct_QuaterTile_Multi;
+ def int_aarch64_sme_sumop4a_wide_2x1 : SME_OuterProduct_QuaterTile_Multi;
+ def int_aarch64_sme_sumop4s_wide_2x1 : SME_OuterProduct_QuaterTile_Multi;
+ def int_aarch64_sme_sumop4a_2x1 : SME_OuterProduct_QuaterTile_Multi;
+ def int_aarch64_sme_sumop4s_2x1 : SME_OuterProduct_QuaterTile_Multi;
+ def int_aarch64_sme_usmop4a_wide_2x1 : SME_OuterProduct_QuaterTile_Multi;
+ def int_aarch64_sme_usmop4s_wide_2x1 : SME_OuterProduct_QuaterTile_Multi;
+ def int_aarch64_sme_usmop4a_2x1 : SME_OuterProduct_QuaterTile_Multi;
+ def int_aarch64_sme_usmop4s_2x1 : SME_OuterProduct_QuaterTile_Multi;
+ def int_aarch64_sme_smop4a_za64_wide_2x1 : SME_OuterProduct_QuaterTile_Multi;
+ def int_aarch64_sme_smop4s_za64_wide_2x1 : SME_OuterProduct_QuaterTile_Multi;
+ def int_aarch64_sme_umop4a_za64_wide_2x1 : SME_OuterProduct_QuaterTile_Multi;
+ def int_aarch64_sme_umop4s_za64_wide_2x1 : SME_OuterProduct_QuaterTile_Multi;
+ def int_aarch64_sme_sumop4a_za64_wide_2x1 : SME_OuterProduct_QuaterTile_Multi;
+ def int_aarch64_sme_sumop4s_za64_wide_2x1 : SME_OuterProduct_QuaterTile_Multi;
+ def int_aarch64_sme_usmop4a_za64_wide_2x1 : SME_OuterProduct_QuaterTile_Multi;
+ def int_aarch64_sme_usmop4s_za64_wide_2x1 : SME_OuterProduct_QuaterTile_Multi;
+
class SME_AddVectorToTile_Intrinsic
: DefaultAttrsIntrinsic<[],
[llvm_i32_ty,
diff --git a/llvm/lib/Target/AArch64/SMEInstrFormats.td b/llvm/lib/Target/AArch64/SMEInstrFormats.td
index 92088da1873a7..25a607e54cf91 100644
--- a/llvm/lib/Target/AArch64/SMEInstrFormats.td
+++ b/llvm/lib/Target/AArch64/SMEInstrFormats.td
@@ -122,6 +122,15 @@ class sme2_quarter_tile_outer_product_pseudo_single_multi<ZPRRegOp zn_ty, Regist
let usesCustomInserter = 1;
}
+class sme2_quarter_tile_outer_product_pseudo_multi_single<RegisterOperand zn_ty, ZPRRegOp zm_ty, SMEMatrixTypeEnum za_flag>
+ : Pseudo<(outs), (ins i32imm:$tile,
+ zn_ty:$zn, zm_ty:$zm), []>,
+ Sched<[]> {
+ // Translated to the actual instructions in AArch64ISelLowering.cpp
+ let SMEMatrixType = za_flag;
+ let usesCustomInserter = 1;
+}
+
class sme2_za_array_2op_multi_single_pseudo<string name, Operand index_ty, RegisterOperand multi_vector_ty,
ZPRRegOp zpr_ty, SMEMatrixTypeEnum za_flag>
: SMEPseudo2Instr<name, 0>,
@@ -282,6 +291,11 @@ class SME2_ZA_Tile_TwoVec_Pat<string name, SDPatternOperator intrinsic, Operand
class SME2_ZA_Tile_Vec_Multi_Pat<string name, SDPatternOperator intrinsic, Operand imm_ty, ValueType vt>
: Pat<(intrinsic imm_ty:$tile, vt:$Zn, vt:$Zm1, vt:$Zm2),
(!cast<Instruction>(name # _PSEUDO) $tile, $Zn, (REG_SEQUENCE ZPR2Mul2, vt:$Zm1, zsub0, vt:$Zm2, zsub1))>;
+
+class SME2_ZA_Tile_Vec_Multi_Single_Pat<string name, SDPatternOperator intrinsic, Operand imm_ty, ValueType vt>
+ : Pat<(intrinsic imm_ty:$tile, vt:$Zn1, vt:$Zn2, vt:$Zm),
+ (!cast<Instruction>(name # _PSEUDO) $tile, (REG_SEQUENCE ZPR2Mul2, vt:$Zn1, zsub0, vt:$Zn2, zsub1), $Zm)>;
+
//===----------------------------------------------------------------------===//
// SME pattern match helpers.
//===----------------------------------------------------------------------===//
@@ -625,6 +639,7 @@ class sme_quarter_outer_product_i16_i32<bit u0, bit N, bit M, bit subtr, Registe
}
multiclass sme_quarter_outer_product_i8_i32<bit zn_u, bit zm_u, bit subtr, string mnemonic, string op>{
+ // Single vectors
def _MZZ_BToS : sme_quarter_outer_product_i8_i32<{zn_u, 0}, {zm_u, 0}, subtr,
ZPR8Mul2_Lo, ZPR8Mul2_Hi, mnemonic>, SMEPseudo2Instr<NAME # _MZZ_BToS, 1>;
@@ -632,8 +647,15 @@ multiclass sme_quarter_outer_product_i8_i32<bit zn_u, bit zm_u, bit subtr, strin
def : SME2_ZA_Tile_TwoVec_Pat<NAME # _MZZ_BToS, !cast<SDPatternOperator>(op # "_1x1"), timm32_0_3, nxv16i8>;
+ // Multiple and single vectors
def _M2ZZ_BToS : sme_quarter_outer_product_i8_i32<{zn_u, 1}, {zm_u, 0}, subtr,
- ZZ_b_mul_r_Lo, ZPR8Mul2_Hi, mnemonic>;
+ ZZ_b_mul_r_Lo, ZPR8Mul2_Hi, mnemonic>, SMEPseudo2Instr<NAME # _M2ZZ_BToS, 1>;
+
+ def NAME # _M2ZZ_BToS # _PSEUDO : sme2_quarter_tile_outer_product_pseudo_multi_single<ZZ_b_mul_r_Lo, ZPR8Mul2_Hi, SMEMatrixTileS>, SMEPseudo2Instr<NAME # _M2ZZ_BToS, 0>;
+
+ def : SME2_ZA_Tile_Vec_Multi_Single_Pat<NAME # _M2ZZ_BToS, !cast<SDPatternOperator>(op # "_2x1"), timm32_0_3, nxv16i8>;
+
+ // Single and multiple vectors
def _MZ2Z_BToS : sme_quarter_outer_product_i8_i32<{zn_u, 0}, {zm_u, 1}, subtr,
ZPR8Mul2_Lo, ZZ_b_mul_r_Hi, mnemonic>, SMEPseudo2Instr<NAME # _MZ2Z_BToS, 1>;
@@ -641,11 +663,13 @@ multiclass sme_quarter_outer_product_i8_i32<bit zn_u, bit zm_u, bit subtr, strin
def : SME2_ZA_Tile_Vec_Multi_Pat<NAME # _MZ2Z_BToS, !cast<SDPatternOperator>(op # "_1x2"), timm32_0_3, nxv16i8>;
+ // Multiple vectors
def _M2Z2Z_BToS : sme_quarter_outer_product_i8_i32<{zn_u, 1}, {zm_u, 1}, subtr,
ZZ_b_mul_r_Lo, ZZ_b_mul_r_Hi, mnemonic>;
}
multiclass sme_quarter_outer_product_i16_i32<bit unsigned, bit subtr, string mnemonic, string op>{
+ // Single vectors
def _MZZ_HToS : sme_quarter_outer_product_i16_i32<unsigned, 0b0, 0b0, subtr,
ZPR16Mul2_Lo, ZPR16Mul2_Hi, mnemonic>, SMEPseudo2Instr<NAME # _MZZ_HToS, 1>;
@@ -653,8 +677,15 @@ multiclass sme_quarter_outer_product_i16_i32<bit unsigned, bit subtr, string mne
def : SME2_ZA_Tile_TwoVec_Pat<NAME # _MZZ_HToS, !cast<SDPatternOperator>(op # "_1x1"), timm32_0_3, nxv8i16>;
+ // Multiple and single vectors
def _M2ZZ_HToS : sme_quarter_outer_product_i16_i32<unsigned, 0b1, 0b0, subtr,
- ZZ_h_mul_r_Lo, ZPR16Mul2_Hi, mnemonic>;
+ ZZ_h_mul_r_Lo, ZPR16Mul2_Hi, mnemonic>, SMEPseudo2Instr<NAME # _M2ZZ_HToS, 1>;
+
+ def NAME # _M2ZZ_HToS # _PSEUDO : sme2_quarter_tile_outer_product_pseudo_multi_single<ZZ_h_mul_r_Lo, ZPR16Mul2_Hi, SMEMatrixTileS>, SMEPseudo2Instr<NAME # _M2ZZ_HToS, 0>;
+
+ def : SME2_ZA_Tile_Vec_Multi_Single_Pat<NAME # _M2ZZ_HToS, !cast<SDPatternOperator>(op # "_2x1"), timm32_0_3, nxv8i16>;
+
+ // Single and multiple vectors
def _MZ2Z_HToS : sme_quarter_outer_product_i16_i32<unsigned, 0b0, 0b1, subtr,
ZPR16Mul2_Lo, ZZ_h_mul_r_Hi, mnemonic>, SMEPseudo2Instr<NAME # _MZ2Z_HToS, 1>;
@@ -662,11 +693,13 @@ multiclass sme_quarter_outer_product_i16_i32<bit unsigned, bit subtr, string mne
def : SME2_ZA_Tile_Vec_Multi_Pat<NAME # _MZ2Z_HToS, !cast<SDPatternOperator>(op # "_1x2"), timm32_0_3, nxv8i16>;
+ // Multiple vectors
def _M2Z2Z_HToS : sme_quarter_outer_product_i16_i32<unsigned, 0b1, 0b1, subtr,
ZZ_h_mul_r_Lo, ZZ_h_mul_r_Hi, mnemonic>;
}
multiclass sme_quarter_outer_product_i64<bit zn_u, bit zm_u, bit subtr, string mnemonic, string op>{
+ // Single vectors
def _MZZ_HtoD : sme_quarter_outer_product_i64<{zn_u, 0}, {zm_u, 0}, subtr,
ZPR16Mul2_Lo, ZPR16Mul2_Hi, mnemonic>, SMEPseudo2Instr<NAME # _MZZ_HtoD, 1>;
@@ -674,8 +707,15 @@ multiclass sme_quarter_outer_product_i64<bit zn_u, bit zm_u, bit subtr, string m
def : SME2_ZA_Tile_TwoVec_Pat<NAME # _MZZ_HtoD, !cast<SDPatternOperator>(op # "_1x1"), timm32_0_7, nxv8i16>;
+ // Multiple and single vectors
def _M2ZZ_HtoD : sme_quarter_outer_product_i64<{zn_u, 1}, {zm_u, 0}, subtr,
- ZZ_h_mul_r_Lo, ZPR16Mul2_Hi, mnemonic>;
+ ZZ_h_mul_r_Lo, ZPR16Mul2_Hi, mnemonic>, SMEPseudo2Instr<NAME # _M2ZZ_HtoD, 1>;
+
+ def NAME # _M2ZZ_HtoD # _PSEUDO : sme2_quarter_tile_outer_product_pseudo_multi_single<ZZ_h_mul_r_Lo, ZPR16Mul2_Hi, SMEMatrixTileD>, SMEPseudo2Instr<NAME # _M2ZZ_HtoD, 0>;
+
+ def : SME2_ZA_Tile_Vec_Multi_Single_Pat<NAME # _M2ZZ_HtoD, !cast<SDPatternOperator>(op # "_2x1"), timm32_0_7, nxv8i16>;
+
+ // Single and multiple vectors
def _MZ2Z_HtoD : sme_quarter_outer_product_i64<{zn_u, 0}, {zm_u, 1}, subtr,
ZPR16Mul2_Lo, ZZ_h_mul_r_Hi, mnemonic>, SMEPseudo2Instr<NAME # _MZ2Z_HtoD, 1>;
@@ -683,6 +723,7 @@ multiclass sme_quarter_outer_product_i64<bit zn_u, bit zm_u, bit subtr, string m
def : SME2_ZA_Tile_Vec_Multi_Pat<NAME # _MZ2Z_HtoD, !cast<SDPatternOperator>(op # "_1x2"), timm32_0_7, nxv8i16>;
+ // Multiple vectors
def _M2Z2Z_HtoD : sme_quarter_outer_product_i64<{zn_u, 1}, {zm_u, 1}, subtr,
ZZ_h_mul_r_Lo, ZZ_h_mul_r_Hi, mnemonic>;
}
@@ -5533,7 +5574,11 @@ multiclass sme2_bfmop4as_widening<bit S, string mnemonic, string op> {
def : SME2_ZA_Tile_TwoVec_Pat<NAME # _MZZ_S, !cast<SDPatternOperator>(op # "_1x1"), timm32_0_3, nxv8bf16>;
// Multiple and single vectors
- def _M2ZZ_S : sme2_bf16_fp32_quarter_tile_outer_product<0, 1, S, mnemonic, ZZ_h_mul_r_Lo, ZPR16Mul2_Hi>;
+ def _M2ZZ_S : sme2_bf16_fp32_quarter_tile_outer_product<0, 1, S, mnemonic, ZZ_h_mul_r_Lo, ZPR16Mul2_Hi>, SMEPseudo2Instr<NAME # _M2ZZ_S, 1>;
+
+ def NAME # _M2ZZ_S # _PSEUDO : sme2_quarter_tile_outer_product_pseudo_multi_single<ZZ_h_mul_r_Lo, ZPR16Mul2_Hi, SMEMatrixTileS>, SMEPseudo2Instr<NAME # _M2ZZ_S, 0>;
+
+ def : SME2_ZA_Tile_Vec_Multi_Single_Pat<NAME # _M2ZZ_S, !cast<SDPatternOperator>(op # "_2x1"), timm32_0_3, nxv8bf16>;
// Single and multiple vectors
def _MZ2Z_S : sme2_bf16_fp32_quarter_tile_outer_product<1, 0, S, mnemonic, ZPR16Mul2_Lo, ZZ_h_mul_r_Hi>, SMEPseudo2Instr<NAME # _MZ2Z_S, 1>;
@@ -5542,7 +5587,6 @@ multiclass sme2_bfmop4as_widening<bit S, string mnemonic, string op> {
def : SME2_ZA_Tile_Vec_Multi_Pat<NAME # _MZ2Z_S, !cast<SDPatternOperator>(op # "_1x2"), timm32_0_3, nxv8bf16>;
-
// Multiple vectors
def _M2Z2Z_S : sme2_bf16_fp32_quarter_tile_outer_product<1, 1, S, mnemonic, ZZ_h_mul_r_Lo, ZZ_h_mul_r_Hi>;
}
@@ -5689,7 +5733,11 @@ multiclass sme2_fmop4as_fp16_non_widening<bit S, string mnemonic, string op> {
def : SME2_ZA_Tile_TwoVec_Pat<NAME # _MZZ_H, !cast<SDPatternOperator>(op # "_1x1"), timm32_0_1, nxv8f16>;
// Multiple and single vectors
- def _M2ZZ_H : sme2_fp16_quarter_tile_outer_product<0, 1, S, mnemonic, ZZ_h_mul_r_Lo, ZPR16Mul2_Hi>;
+ def _M2ZZ_H : sme2_fp16_quarter_tile_outer_product<0, 1, S, mnemonic, ZZ_h_mul_r_Lo, ZPR16Mul2_Hi>, SMEPseudo2Instr<NAME # _M2ZZ_H, 1>;
+
+ def NAME # _M2ZZ_H # _PSEUDO : sme2_quarter_tile_outer_product_pseudo_multi_single<ZZ_h_mul_r_Lo, ZPR16Mul2_Hi, SMEMatrixTileH>, SMEPseudo2Instr<NAME # _M2ZZ_H, 0>;
+
+ def : SME2_ZA_Tile_Vec_Multi_Single_Pat<NAME # _M2ZZ_H, !cast<SDPatternOperator>(op # "_2x1"), timm32_0_1, nxv8f16>;
// Single and multiple vectors
def _MZ2Z_H : sme2_fp16_quarter_tile_outer_product<1, 0, S, mnemonic, ZPR16Mul2_Lo, ZZ_h_mul_r_Hi>, SMEPseudo2Instr<NAME # _MZ2Z_H, 1>;
@@ -5769,7 +5817,11 @@ multiclass sme2_bfmop4as_non_widening<bit S, string mnemonic, string op> {
def : SME2_ZA_Tile_TwoVec_Pat<NAME # _MZZ_H, !cast<SDPatternOperator>(op # "_1x1"), timm32_0_1, nxv8bf16>;
// Multiple and single vectors
- def _M2ZZ_H : sme2_bf16_fp16_quarter_tile_outer_product<0, 1, S, mnemonic, ZZ_h_mul_r_Lo, ZPR16Mul2_Hi>;
+ def _M2ZZ_H : sme2_bf16_fp16_quarter_tile_outer_product<0, 1, S, mnemonic, ZZ_h_mul_r_Lo, ZPR16Mul2_Hi>, SMEPseudo2Instr<NAME # _M2ZZ_H, 1>;
+
+ def NAME # _M2ZZ_H # _PSEUDO : sme2_quarter_tile_outer_product_pseudo_multi_single<ZZ_h_mul_r_Lo, ZPR16Mul2_Hi, SMEMatrixTileH>, SMEPseudo2Instr<NAME # _M2ZZ_H, 0>;
+
+ def : SME2_ZA_Tile_Vec_Multi_Single_Pat<NAME # _M2ZZ_H, !cast<SDPatternOperator>(op # "_2x1"), timm32_0_1, nxv8bf16>;
// Single and multiple vectors
def _MZ2Z_H : sme2_bf16_fp16_quarter_tile_outer_product<1, 0, S, mnemonic, ZPR16Mul2_Lo, ZZ_h_mul_r_Hi>, SMEPseudo2Instr<NAME # _MZ2Z_H, 1>;
@@ -5814,7 +5866,11 @@ multiclass sme2_fmop4as_fp32_non_widening<bit S, string mnemonic, string op> {
def : SME2_ZA_Tile_TwoVec_Pat<NAME # _MZZ_S, !cast<SDPatternOperator>(op # "_1x1"), timm32_0_3, nxv4f32>;
// Multiple and single vectors
- def _M2ZZ_S : sme2_fp32_quarter_tile_outer_product<0, 1, S, mnemonic, ZZ_s_mul_r_Lo, ZPR32Mul2_Hi>;
+ def _M2ZZ_S : sme2_fp32_quarter_tile_outer_product<0, 1, S, mnemonic, ZZ_s_mul_r_Lo, ZPR32Mul2_Hi>, SMEPseudo2Instr<NAME # _M2ZZ_S, 1>;
+
+ def NAME # _M2ZZ_S # _PSEUDO : sme2_quarter_tile_outer_product_pseudo_multi_single<ZZ_s_mul_r_Lo, ZPR32Mul2_Hi, SMEMatrixTileS>, SMEPseudo2Instr<NAME # _M2ZZ_S, 0>;
+
+ def : SME2_ZA_Tile_Vec_Multi_Single_Pat<NAME # _M2ZZ_S, !cast<SDPatternOperator>(op # "_2x1"), timm32_0_3, nxv4f32>;
// Single and multiple vectors
def _MZ2Z_S : sme2_fp32_quarter_tile_outer_product<1, 0, S, mnemonic, ZPR32Mul2_Lo, ZZ_s_mul_r_Hi>, SMEPseudo2Instr<NAME # _MZ2Z_S, 1>;
@@ -5859,7 +5915,11 @@ multiclass sme2_fmop4as_fp64_non_widening<bit S, string mnemonic, string op> {
def : SME2_ZA_Tile_TwoVec_Pat<NAME # _MZZ_D, !cast<SDPatternOperator>(op # "_1x1"), timm32_0_7, nxv2f64>;
// Multiple and single vectors
- def _M2ZZ_D : sme2_fp64_quarter_tile_outer_product<0, 1, S, mnemonic, ZZ_d_mul_r_Lo, ZPR64Mul2_Hi>;
+ def _M2ZZ_D : sme2_fp64_quarter_tile_outer_product<0, 1, S, mnemonic, ZZ_d_mul_r_Lo, ZPR64Mul2_Hi>, SMEPseudo2Instr<NAME # _M2ZZ_D, 1>;
+
+ def NAME # _M2ZZ_D # _PSEUDO : sme2_quarter_tile_outer_product_pseudo_multi_single<ZZ_d_mul_r_Lo, ZPR64Mul2_Hi, SMEMatrixTileD>, SMEPseudo2Instr<NAME # _M2ZZ_D, 0>;
+
+ def : SME2_ZA_Tile_Vec_Multi_Single_Pat<NAME # _M2ZZ_D, !cast<SDPatternOperator>(op # "_2x1"), timm32_0_7, nxv2f64>;
// Single and multiple vectors
def _MZ2Z_D : sme2_fp64_quarter_tile_outer_product<1, 0, S, mnemonic, ZPR64Mul2_Lo, ZZ_d_mul_r_Hi>, SMEPseudo2Instr<NAME # _MZ2Z_D, 1>;
@@ -5904,7 +5964,11 @@ multiclass sme2_fmop4as_fp16_fp32_widening<bit S, string mnemonic, string op> {
def : SME2_ZA_Tile_TwoVec_Pat<NAME # _MZZ_HtoS, !cast<SDPatternOperator>(op # "_1x1"), timm32_0_3, nxv8f16>;
// Multiple and single vectors
- def _M2ZZ_HtoS : sme2_fp16_fp32_quarter_tile_outer_product<0, 1, S, mnemonic, ZZ_h_mul_r_Lo, ZPR16Mul2_Hi>;
+ def _M2ZZ_HtoS : sme2_fp16_fp32_quarter_tile_outer_product<0, 1, S, mnemonic, ZZ_h_mul_r_Lo, ZPR16Mul2_Hi>, SMEPseudo2Instr<NAME # _M2ZZ_HtoS, 1>;
+
+ def NAME # _M2ZZ_HtoS # _PSEUDO : sme2_quarter_tile_outer_product_pseudo_multi_single<ZZ_h_mul_r_Lo, ZPR16Mul2_Hi, SMEMatrixTileS>, SMEPseudo2Instr<NAME # _M2ZZ_HtoS, 0>;
+
+ def : SME2_ZA_Tile_Vec_Multi_Single_Pat<NAME # _M2ZZ_HtoS, !cast<SDPatternOperator>(op # "_2x1"), timm32_0_3, nxv8f16>;
// Single and multiple vectors
def _MZ2Z_HtoS : sme2_fp16_fp32_quarter_tile_outer_product<1, 0, S, mnemonic, ZPR16Mul2_Lo, ZZ_h_mul_r_Hi>, SMEPseudo2Instr<NAME # _MZ2Z_HtoS, 1>;
diff --git a/llvm/test/CodeGen/AArch64/sme2-intrinsics-mop4a_2x1.ll b/llvm/test/CodeGen/AArch64/sme2-intrinsics-mop4a_2x1.ll
new file mode 100644
index 0000000000000..ef1536fae6496
--- /dev/null
+++ b/llvm/test/CodeGen/AArch64/sme2-intrinsics-mop4a_2x1.ll
@@ -0,0 +1,393 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 4
+; RUN: llc -force-streaming -verify-machineinstrs < %s | FileCheck %s
+
+target triple = "aarch64-linux"
+
+; Widening
+define void @mop4a_za32_s8(<vscale x 16 x i8> %zn1, <vscale x 16 x i8> %zn2, <vscale x 16 x i8> %zm) #0 {
+; CHECK-LABEL: mop4a_za32_s8:
+; CHECK: // %bb.0:
+; CHECK-NEXT: mov z24.d, z2.d
+; CHECK-NEXT: // kill: def $z1 killed $z1 killed $z0_z1 def $z0_z1
+; CHECK-NEXT: // kill: def $z0 killed $z0 killed $z0_z1 def $z0_z1
+; CHECK-NEXT: smop4a za0.s, { z0.b, z1.b }, z24.b
+; CHECK-NEXT: ret
+ call void @llvm.aarch64.sme.smop4a.wide.2x1.nxv16i8(i32 0, <vscale x 16 x i8> %zn1, <vscale x 16 x i8> %zn2, <vscale x 16 x i8> %zm)
+ ret void
+}
+
+define void @mop4s_za32_s8(<vscale x 16 x i8> %zn1, <vscale x 16 x i8> %zn2, <vscale x 16 x i8> %zm) #0 {
+; CHECK-LABEL: mop4s_za32_s8:
+; CHECK: // %bb.0:
+; CHECK-NEXT: mov z24.d, z2.d
+; CHECK-NEXT: // kill: def $z1 killed $z1 killed $z0_z1 def $z0_z1
+; CHECK-NEXT: // kill: def $z0 killed $z0 killed $z0_z1 def $z0_z1
+; CHECK-NEXT: smop4s za0.s, { z0.b, z1.b }, z24.b
+; CHECK-NEXT: ret
+ call void @llvm.aarch64.sme.smop4s.wide.2x1.nxv16i8(i32 0, <vscale x 16 x i8> %zn1, <vscale x 16 x i8> %zn2, <vscale x 16 x i8> %zm)
+ ret void
+}
+
+define void @mop4a_za32_u8(<vscale x 16 x i8> %zn1, <vscale x 16 x i8> %zn2, <vscale x 16 x i8> %zm) #0 {
+; CHECK-LABEL: mop4a_za32_u8:
+; CHECK: // %bb.0:
+; CHECK-NEXT: mov z24.d, z2.d
+; CHECK-NEXT: // kill: def $z1 killed $z1 killed $z0_z1 def $z0_z1
+; CHECK-NEXT: // kill: def $z0 killed $z0 killed $z0_z1 def $z0_z1
+; CHECK-NEXT: umop4a za0.s, { z0.b, z1.b }, z24.b
+; CHECK-NEXT: ret
+ call void @llvm.aarch64.sme.umop4a.wide.2x1.nxv16i8(i32 0, <vscale x 16 x i8> %zn1, <vscale x 16 x i8> %zn2, <vscale x 16 x i8> %zm)
+ ret void
+}
+
+define void @mop4s_za32_u8(<vscale x 16 x i8> %zn1, <vscale x 16 x i8> %zn2, <vscale x 16 x i8> %zm) #0 {
+; CHECK-LABEL: mop4s_za32_u8:
+; CHECK: // %bb.0:
+; CHECK-NEXT: mov z24.d, z2.d
+; CHECK-NEXT: // kill: def $z1 killed $z1 killed $z0_z1 def $z0_z1
+; CHECK-NEXT: // kill: def $z0 killed $z0 killed $z0_z1 def $z0_z1
+; CHECK-NEXT: umop4s za0.s, { z0.b, z1.b }, z24.b
+; CHECK-NEXT: ret
+ call void @llvm.aarch64.sme.umop4s.wide.2x1.nxv16i8(i32 0, <vscale x 16 x i8> %zn1, <vscale x 16 x i8> %zn2, <vscale x 16 x i8> %zm)
+ ret void
+}
+
+define void @mop4a_za32_s8_u8(<vscale x 16 x i8> %zn1, <vscale x 16 x i8> %zn2, <vscale x 16 x i8> %zm) #0 {
+; CHECK-LABEL: mop4a_za32_s8_u8:
+; CHECK: // %bb.0:
+; CHECK-NEXT: mov z24.d, z2.d
+; CHECK-NEXT: // kill: def $z1 killed $z1 killed $z0_z1 def $z0_z1
+; CHECK-NEXT: // kill: def $z0 killed $z0 killed $z0_z1 def $z0_z1
+; CHECK-NEXT: sumop4a za0.s, { z0.b, z1.b }, z24.b
+; CHECK-NEXT: ret
+ call void @llvm.aarch64.sme.sumop4a.wide.2x1.nxv16i8(i32 0, <vscale x 16 x i8> %zn1, <vscale x 16 x i8> %zn2, <vscale x 16 x i8> %zm)
+ ret void
+}
+
+define void @mop4s_za32_s8_u8(<vscale x 16 x i8> %zn1, <vscale x 16 x i8> %zn2, <vscale x 16 x i8> %zm) #0 {
+; CHECK-LABEL: mop4s_za32_s8_u8:
+; CHECK: // %bb.0:
+; CHECK-NEXT: mov z24.d, z2.d
+; CHECK-NEXT: // kill: def $z1 killed $z1 killed $z0_z1 def $z0_z1
+; CHECK-NEXT: // kill: def $z0 killed $z0 killed $z0_z1 def $z0_z1
+; CHECK-NEXT: sumop4s za0.s, { z0.b, z1.b }, z24.b
+; CHECK-NEXT: ret
+ call void @llvm.aarch64.sme.sumop4s.wide.2x1.nxv16i8(i32 0, <vscale x 16 x i8> %zn1, <vscale x 16 x i8> %zn2, <vscale x 16 x i8> %zm)
+ ret void
+}
+
+define void @mop4a_za32_u8_s8(<vscale x 16 x i8> %zn1, <vscale x 16 x i8> %zn2, <vscale x 16 x i8> %zm) #0 {
+; CHECK-LABEL: mop4a_za32_u8_s8:
+; CHECK: // %bb.0:
+; CHECK-NEXT: mov z24.d, z2.d
+; CHECK-NEXT: // kill: def $z1 killed $z1 killed $z0_z1 def $z0_z1
+; CHECK-NEXT: // kill: def $z0 killed $z0 killed $z0_z1 def $z0_z1
+; CHECK-NEXT: usmop4a za0.s, { z0.b, z1.b }, z24.b
+; CHECK-NEXT: ret
+ call void @llvm.aarch64.sme.usmop4a.wide.2x1.nxv16i8(i32 0, <vscale x 16 x i8> %zn1, <vscale x 16 x i8> %zn2, <vscale x 16 x i8> %zm)
+ ret void
+}
+
+define void @mop4s_za32_u8_s8(<vscale x 16 x i8> %zn1, <vscale x 16 x i8> %zn2, <vscale x 16 x i8> %zm) #0 {
+; CHECK-LABEL: mop4s_za32_u8_s8:
+; CHECK: // %bb.0:
+; CHECK-NEXT: mov z24.d, z2.d
+; CHECK-NEXT: // kill: def $z1 killed $z1 killed $z0_z1 def $z0_z1
+; CHECK-NEXT: // kill: def $z0 killed $z0 killed $z0_z1 def $z0_z1
+; CHECK-NEXT: usmop4s za0.s, { z0.b, z1.b }, z24.b
+; CHECK-NEXT: ret
+ call void @llvm.aarch64.sme.usmop4s.wide.2x1.nxv16i8(i32 0, <vscale x 16 x i8> %zn1, <vscale x 16 x i8> %zn2, <vscale x 16 x i8> %zm)
+ ret void
+}
+
+
+define void @mop4a_za32_s16(<vscale x 8 x i16> %zn1, <vscale x 8 x i16> %zn2, <vscale x 8 x i16> %zm) #0 {
+; CHECK-LABEL: mop4a_za32_s16:
+; CHECK: // %bb.0:
+; CHECK-NEXT: mov z24.d, z2.d
+; CHECK-NEXT: // kill: def $z1 killed $z1 killed $z0_z1 def $z0_z1
+; CHECK-NEXT: // kill: def $z0 killed $z0 killed $z0_z1 def $z0_z1
+; CHECK-NEXT: smop4a za0.s, { z0.h, z1.h }, z24.h
+; CHECK-NEXT: ret
+ call void @llvm.aarch64.sme.smop4a.wide.2x1.nxv8i16(i32 0, <vscale x 8 x i16> %zn1, <vscale x 8 x i16> %zn2, <vscale x 8 x i16> %zm)
+ ret void
+}
+
+define void @mop4s_za32_s16(<vscale x 8 x i16> %zn1, <vscale x 8 x i16> %zn2, <vscale x 8 x i16> %zm) #0 {
+; CHECK-LABEL: mop4s_za32_s16:
+; CHECK: // %bb.0:
+; CHECK-NEXT: mov z24.d, z2.d
+; CHECK-NEXT: // kill: def $z1 killed $z1 killed $z0_z1 def $z0_z1
+; CHECK-NEXT: // kill: def $z0 killed $z0 killed $z0_z1 def $z0_z1
+; CHECK-NEXT: smop4s za0.s, { z0.h, z1.h }, z24.h
+; CHECK-NEXT: ret
+ call void @llvm.aarch64.sme.smop4s.wide.2x1.nxv8i16(i32 0, <vscale x 8 x i16> %zn1, <vscale x 8 x i16> %zn2, <vscale x 8 x i16> %zm)
+ ret void
+}
+
+define void @mop4a_za32_u16(<vscale x 8 x i16> %zn1, <vscale x 8 x i16> %zn2, <vscale x 8 x i16> %zm) #0 {
+; CHECK-LABEL: mop4a_za32_u16:
+; CHECK: // %bb.0:
+; CHECK-NEXT: mov z24.d, z2.d
+; CHECK-NEXT: // kill: def $z1 killed $z1 killed $z0_z1 def $z0_z1
+; CHECK-NEXT: // kill: def $z0 killed $z0 killed $z0_z1 def $z0_z1
+; CHECK-NEXT: umop4a za0.s, { z0.h, z1.h }, z24.h
+; CHECK-NEXT: ret
+ call void @llvm.aarch64.sme.umop4a.wide.2x1.nxv8i16(i32 0, <vscale x 8 x i16> %zn1, <vscale x 8 x i16> %zn2, <vscale x 8 x i16> %zm)
+ ret void
+}
+
+define void @mop4s_za32_u16(<vscale x 8 x i16> %zn1, <vscale x 8 x i16> %zn2, <vscale x 8 x i16> %zm) #0 {
+; CHECK-LABEL: mop4s_za32_u16:
+; CHECK: // %bb.0:
+; CHECK-NEXT: mov z24.d, z2.d
+; CHECK-NEXT: // kill: def $z1 killed $z1 killed $z0_z1 def $z0_z1
+; CHECK-NEXT: // kill: def $z0 killed $z0 killed $z0_z1 def $z0_z1
+; CHECK-NEXT: umop4s za0.s, { z0.h, z1.h }, z24.h
+; CHECK-NEXT: ret
+ call void @llvm.aarch64.sme.umop4s.wide.2x1.nxv8i16(i32 0, <vscale x 8 x i16> %zn1, <vscale x 8 x i16> %zn2, <vscale x 8 x i16> %zm)
+ ret void
+}
+
+define void @mop4a_za32_f16(<vscale x 8 x half> %zn1, <vscale x 8 x half> %zn2, <vscale x 8 x half> %zm) #0 {
+; CHECK-LABEL: mop4a_za32_f16:
+; CHECK: // %bb.0:
+; CHECK-NEXT: mov z24.d, z2.d
+; CHECK-NEXT: // kill: def $z1 killed $z1 killed $z0_z1 def $z0_z1
+; CHECK-NEXT: // kill: def $z0 killed $z0 killed $z0_z1 def $z0_z1
+; CHECK-NEXT: fmop4a za0.s, { z0.h, z1.h }, z24.h
+; CHECK-NEXT: ret
+ call void @llvm.aarch64.sme.mop4a.wide.2x1.nxv8f16(i32 0, <vscale x 8 x half> %zn1, <vscale x 8 x half> %zn2, <vscale x 8 x half> %zm)
+ ret void
+}
+
+define void @mop4s_za32_f16(<vscale x 8 x half> %zn1, <vscale x 8 x half> %zn2, <vscale x 8 x half> %zm) #0 {
+; CHECK-LABEL: mop4s_za32_f16:
+; CHECK: // %bb.0:
+; CHECK-NEXT: mov z24.d, z2.d
+; CHECK-NEXT: // kill: def $z1 killed $z1 killed $z0_z1 def $z0_z1
+; CHECK-NEXT: // kill: def $z0 killed $z0 killed $z0_z1 def $z0_z1
+; CHECK-NEXT: fmop4s za0.s, { z0.h, z1.h }, z24.h
+; CHECK-NEXT: ret
+ call void @llvm.aarch64.sme.mop4s.wide.2x1.nxv8f16(i32 0, <vscale x 8 x half> %zn1, <vscale x 8 x half> %zn2, <vscale x 8 x half> %zm)
+ ret void
+}
+
+define void @mop4a_za32_bf16(<vscale x 8 x bfloat> %zn1, <vscale x 8 x bfloat> %zn2, <vscale x 8 x bfloat> %zm) #0 {
+; CHECK-LABEL: mop4a_za32_bf16:
+; CHECK: // %bb.0:
+; CHECK-NEXT: mov z24.d, z2.d
+; CHECK-NEXT: // kill: def $z1 killed $z1 killed $z0_z1 def $z0_z1
+; CHECK-NEXT: // kill: def $z0 killed $z0 killed $z0_z1 def $z0_z1
+; CHECK-NEXT: bfmop4a za0.s, { z0.h, z1.h }, z24.h
+; CHECK-NEXT: ret
+ call void @llvm.aarch64.sme.mop4a.wide.2x1.nxv8bf16(i32 0, <vscale x 8 x bfloat> %zn1, <vscale x 8 x bfloat> %zn2, <vscale x 8 x bfloat> %zm)
+ ret void
+}
+
+define void @mop4s_za32_bf16(<vscale x 8 x bfloat> %zn1, <vscale x 8 x bfloat> %zn2, <vscale x 8 x bfloat> %zm) #0 {
+; CHECK-LABEL: mop4s_za32_bf16:
+; CHECK: // %bb.0:
+; CHECK-NEXT: mov z24.d, z2.d
+; CHECK-NEXT: // kill: def $z1 killed $z1 killed $z0_z1 def $z0_z1
+; CHECK-NEXT: // kill: def $z0 killed $z0 killed $z0_z1 def $z0_z1
+; CHECK-NEXT: bfmop4s za0.s, { z0.h, z1.h }, z24.h
+; CHECK-NEXT: ret
+ call void @llvm.aarch64.sme.mop4s.wide.2x1.nxv8bf16(i32 0, <vscale x 8 x bfloat> %zn1, <vscale x 8 x bfloat> %zn2, <vscale x 8 x bfloat> %zm)
+ ret void
+}
+
+define void @mop4a_za64_s16(<vscale x 8 x i16> %zn1, <vscale x 8 x i16> %zn2, <vscale x 8 x i16> %zm) #0 {
+; CHECK-LABEL: mop4a_za64_s16:
+; CHECK: // %bb.0:
+; CHECK-NEXT: mov z24.d, z2.d
+; CHECK-NEXT: // kill: def $z1 killed $z1 killed $z0_z1 def $z0_z1
+; CHECK-NEXT: // kill: def $z0 killed $z0 killed $z0_z1 def $z0_z1
+; CHECK-NEXT: smop4a za0.d, { z0.h, z1.h }, z24.h
+; CHECK-NEXT: ret
+ call void @llvm.aarch64.sme.smop4a.za64.wide.2x1.nxv8i16(i32 0, <vscale x 8 x i16> %zn1, <vscale x 8 x i16> %zn2, <vscale x 8 x i16> %zm)
+ ret void
+}
+
+define void @mop4s_za64_s16(<vscale x 8 x i16> %zn1, <vscale x 8 x i16> %zn2, <vscale x 8 x i16> %zm) #0 {
+; CHECK-LABEL: mop4s_za64_s16:
+; CHECK: // %bb.0:
+; CHECK-NEXT: mov z24.d, z2.d
+; CHECK-NEXT: // kill: def $z1 killed $z1 killed $z0_z1 def $z0_z1
+; CHECK-NEXT: // kill: def $z0 killed $z0 killed $z0_z1 def $z0_z1
+; CHECK-NEXT: smop4s za0.d, { z0.h, z1.h }, z24.h
+; CHECK-NEXT: ret
+ call void @llvm.aarch64.sme.smop4s.za64.wide.2x1.nxv8i16(i32 0, <vscale x 8 x i16> %zn1, <vscale x 8 x i16> %zn2, <vscale x 8 x i16> %zm)
+ ret void
+}
+
+define void @mop4a_za64_u16(<vscale x 8 x i16> %zn1, <vscale x 8 x i16> %zn2, <vscale x 8 x i16> %zm) #0 {
+; CHECK-LABEL: mop4a_za64_u16:
+; CHECK: // %bb.0:
+; CHECK-NEXT: mov z24.d, z2.d
+; CHECK-NEXT: // kill: def $z1 killed $z1 killed $z0_z1 def $z0_z1
+; CHECK-NEXT: // kill: def $z0 killed $z0 killed $z0_z1 def $z0_z1
+; CHECK-NEXT: umop4a za0.d, { z0.h, z1.h }, z24.h
+; CHECK-NEXT: ret
+ call void @llvm.aarch64.sme.umop4a.za64.wide.2x1.nxv8i16(i32 0, <vscale x 8 x i16> %zn1, <vscale x 8 x i16> %zn2, <vscale x 8 x i16> %zm)
+ ret void
+}
+
+define void @mop4s_za64_u16(<vscale x 8 x i16> %zn1, <vscale x 8 x i16> %zn2, <vscale x 8 x i16> %zm) #0 {
+; CHECK-LABEL: mop4s_za64_u16:
+; CHECK: // %bb.0:
+; CHECK-NEXT: mov z24.d, z2.d
+; CHECK-NEXT: // kill: def $z1 killed $z1 killed $z0_z1 def $z0_z1
+; CHECK-NEXT: // kill: def $z0 killed $z0 killed $z0_z1 def $z0_z1
+; CHECK-NEXT: umop4s za0.d, { z0.h, z1.h }, z24.h
+; CHECK-NEXT: ret
+ call void @llvm.aarch64.sme.umop4s.za64.wide.2x1.nxv8i16(i32 0, <vscale x 8 x i16> %zn1, <vscale x 8 x i16> %zn2, <vscale x 8 x i16> %zm)
+ ret void
+}
+
+define void @mop4a_za64_s16_u16(<vscale x 8 x i16> %zn1, <vscale x 8 x i16> %zn2, <vscale x 8 x i16> %zm) #0 {
+; CHECK-LABEL: mop4a_za64_s16_u16:
+; CHECK: // %bb.0:
+; CHECK-NEXT: mov z24.d, z2.d
+; CHECK-NEXT: // kill: def $z1 killed $z1 killed $z0_z1 def $z0_z1
+; CHECK-NEXT: // kill: def $z0 killed $z0 killed $z0_z1 def $z0_z1
+; CHECK-NEXT: sumop4a za0.d, { z0.h, z1.h }, z24.h
+; CHECK-NEXT: ret
+ call void @llvm.aarch64.sme.sumop4a.za64.wide.2x1.nxv8i16(i32 0, <vscale x 8 x i16> %zn1, <vscale x 8 x i16> %zn2, <vscale x 8 x i16> %zm)
+ ret void
+}
+
+define void @mop4s_za64_s16_u16(<vscale x 8 x i16> %zn1, <vscale x 8 x i16> %zn2, <vscale x 8 x i16> %zm) #0 {
+; CHECK-LABEL: mop4s_za64_s16_u16:
+; CHECK: // %bb.0:
+; CHECK-NEXT: mov z24.d, z2.d
+; CHECK-NEXT: // kill: def $z1 killed $z1 killed $z0_z1 def $z0_z1
+; CHECK-NEXT: // kill: def $z0 killed $z0 killed $z0_z1 def $z0_z1
+; CHECK-NEXT: sumop4s za0.d, { z0.h, z1.h }, z24.h
+; CHECK-NEXT: ret
+ call void @llvm.aarch64.sme.sumop4s.za64.wide.2x1.nxv8i16(i32 0, <vscale x 8 x i16> %zn1, <vscale x 8 x i16> %zn2, <vscale x 8 x i16> %zm)
+ ret void
+}
+
+define void @mop4a_za64_u16_s16(<vscale x 8 x i16> %zn1, <vscale x 8 x i16> %zn2, <vscale x 8 x i16> %zm) #0 {
+; CHECK-LABEL: mop4a_za64_u16_s16:
+; CHECK: // %bb.0:
+; CHECK-NEXT: mov z24.d, z2.d
+; CHECK-NEXT: // kill: def $z1 killed $z1 killed $z0_z1 def $z0_z1
+; CHECK-NEXT: // kill: def $z0 killed $z0 killed $z0_z1 def $z0_z1
+; CHECK-NEXT: usmop4a za0.d, { z0.h, z1.h }, z24.h
+; CHECK-NEXT: ret
+ call void @llvm.aarch64.sme.usmop4a.za64.wide.2x1.nxv8i16(i32 0, <vscale x 8 x i16> %zn1, <vscale x 8 x i16> %zn2, <vscale x 8 x i16> %zm)
+ ret void
+}
+
+define void @mop4s_za64_u16_s16(<vscale x 8 x i16> %zn1, <vscale x 8 x i16> %zn2, <vscale x 8 x i16> %zm) #0 {
+; CHECK-LABEL: mop4s_za64_u16_s16:
+; CHECK: // %bb.0:
+; CHECK-NEXT: mov z24.d, z2.d
+; CHECK-NEXT: // kill: def $z1 killed $z1 killed $z0_z1 def $z0_z1
+; CHECK-NEXT: // kill: def $z0 killed $z0 killed $z0_z1 def $z0_z1
+; CHECK-NEXT: usmop4s za0.d, { z0.h, z1.h }, z24.h
+; CHECK-NEXT: ret
+ call void @llvm.aarch64.sme.usmop4s.za64.wide.2x1.nxv8i16(i32 0, <vscale x 8 x i16> %zn1, <vscale x 8 x i16> %zn2, <vscale x 8 x i16> %zm)
+ ret void
+}
+
+; Non-widening
+define void @mop4a_za16_f16(<vscale x 8 x half> %zn1, <vscale x 8 x half> %zn2, <vscale x 8 x half> %zm) #0 {
+; CHECK-LABEL: mop4a_za16_f16:
+; CHECK: // %bb.0:
+; CHECK-NEXT: mov z24.d, z2.d
+; CHECK-NEXT: // kill: def $z1 killed $z1 killed $z0_z1 def $z0_z1
+; CHECK-NEXT: // kill: def $z0 killed $z0 killed $z0_z1 def $z0_z1
+; CHECK-NEXT: fmop4a za0.h, { z0.h, z1.h }, z24.h
+; CHECK-NEXT: ret
+ call void @llvm.aarch64.sme.mop4a.2x1.nxv8f16(i32 0, <vscale x 8 x half> %zn1, <vscale x 8 x half> %zn2, <vscale x 8 x half> %zm)
+ ret void
+}
+
+define void @mop4s_za16_f16(<vscale x 8 x half> %zn1, <vscale x 8 x half> %zn2, <vscale x 8 x half> %zm) #0 {
+; CHECK-LABEL: mop4s_za16_f16:
+; CHECK: // %bb.0:
+; CHECK-NEXT: mov z24.d, z2.d
+; CHECK-NEXT: // kill: def $z1 killed $z1 killed $z0_z1 def $z0_z1
+; CHECK-NEXT: // kill: def $z0 killed $z0 killed $z0_z1 def $z0_z1
+; CHECK-NEXT: fmop4s za0.h, { z0.h, z1.h }, z24.h
+; CHECK-NEXT: ret
+ call void @llvm.aarch64.sme.mop4s.2x1.nxv8f16(i32 0, <vscale x 8 x half> %zn1, <vscale x 8 x half> %zn2, <vscale x 8 x half> %zm)
+ ret void
+}
+
+define void @mop4a_za32_f32(<vscale x 4 x float> %zn1, <vscale x 4 x float> %zn2, <vscale x 4 x float> %zm) #0 {
+; CHECK-LABEL: mop4a_za32_f32:
+; CHECK: // %bb.0:
+; CHECK-NEXT: mov z24.d, z2.d
+; CHECK-NEXT: // kill: def $z1 killed $z1 killed $z0_z1 def $z0_z1
+; CHECK-NEXT: // kill: def $z0 killed $z0 killed $z0_z1 def $z0_z1
+; CHECK-NEXT: fmop4a za0.s, { z0.s, z1.s }, z24.s
+; CHECK-NEXT: ret
+ call void @llvm.aarch64.sme.mop4a.2x1.nxv4f32(i32 0, <vscale x 4 x float> %zn1, <vscale x 4 x float> %zn2, <vscale x 4 x float> %zm)
+ ret void
+}
+
+define void @mop4s_za32_f32(<vscale x 4 x float> %zn1, <vscale x 4 x float> %zn2, <vscale x 4 x float> %zm) #0 {
+; CHECK-LABEL: mop4s_za32_f32:
+; CHECK: // %bb.0:
+; CHECK-NEXT: mov z24.d, z2.d
+; CHECK-NEXT: // kill: def $z1 killed $z1 killed $z0_z1 def $z0_z1
+; CHECK-NEXT: // kill: def $z0 killed $z0 killed $z0_z1 def $z0_z1
+; CHECK-NEXT: fmop4s za0.s, { z0.s, z1.s }, z24.s
+; CHECK-NEXT: ret
+ call void @llvm.aarch64.sme.mop4s.2x1.nxv4f32(i32 0, <vscale x 4 x float> %zn1, <vscale x 4 x float> %zn2, <vscale x 4 x float> %zm)
+ ret void
+}
+
+define void @mop4a_za64_f64(<vscale x 2 x double> %zn1, <vscale x 2 x double> %zn2, <vscale x 2 x double> %zm) #0 {
+; CHECK-LABEL: mop4a_za64_f64:
+; CHECK: // %bb.0:
+; CHECK-NEXT: mov z24.d, z2.d
+; CHECK-NEXT: // kill: def $z1 killed $z1 killed $z0_z1 def $z0_z1
+; CHECK-NEXT: // kill: def $z0 killed $z0 killed $z0_z1 def $z0_z1
+; CHECK-NEXT: fmop4a za0.d, { z0.d, z1.d }, z24.d
+; CHECK-NEXT: ret
+ call void @llvm.aarch64.sme.mop4a.2x1.nxv2f64(i32 0, <vscale x 2 x double> %zn1, <vscale x 2 x double> %zn2, <vscale x 2 x double> %zm)
+ ret void
+}
+
+define void @mop4s_za64_f64(<vscale x 2 x double> %zn1, <vscale x 2 x double> %zn2, <vscale x 2 x double> %zm) #0 {
+; CHECK-LABEL: mop4s_za64_f64:
+; CHECK: // %bb.0:
+; CHECK-NEXT: mov z24.d, z2.d
+; CHECK-NEXT: // kill: def $z1 killed $z1 killed $z0_z1 def $z0_z1
+; CHECK-NEXT: // kill: def $z0 killed $z0 killed $z0_z1 def $z0_z1
+; CHECK-NEXT: fmop4s za0.d, { z0.d, z1.d }, z24.d
+; CHECK-NEXT: ret
+ call void @llvm.aarch64.sme.mop4s.2x1.nxv2f64(i32 0, <vscale x 2 x double> %zn1, <vscale x 2 x double> %zn2, <vscale x 2 x double> %zm)
+ ret void
+}
+
+define void @mop4a_za16_bf16(<vscale x 8 x bfloat> %zn1, <vscale x 8 x bfloat> %zn2, <vscale x 8 x bfloat> %zm) #0 {
+; CHECK-LABEL: mop4a_za16_bf16:
+; CHECK: // %bb.0:
+; CHECK-NEXT: mov z24.d, z2.d
+; CHECK-NEXT: // kill: def $z1 killed $z1 killed $z0_z1 def $z0_z1
+; CHECK-NEXT: // kill: def $z0 killed $z0 killed $z0_z1 def $z0_z1
+; CHECK-NEXT: bfmop4a za0.h, { z0.h, z1.h }, z24.h
+; CHECK-NEXT: ret
+ call void @llvm.aarch64.sme.mop4a.2x1.nxv8bf16(i32 0, <vscale x 8 x bfloat> %zn1, <vscale x 8 x bfloat> %zn2, <vscale x 8 x bfloat> %zm)
+ ret void
+}
+
+define void @mop4s_za16_bf16(<vscale x 8 x bfloat> %zn1, <vscale x 8 x bfloat> %zn2, <vscale x 8 x bfloat> %zm) #0 {
+; CHECK-LABEL: mop4s_za16_bf16:
+; CHECK: // %bb.0:
+; CHECK-NEXT: mov z24.d, z2.d
+; CHECK-NEXT: // kill: def $z1 killed $z1 killed $z0_z1 def $z0_z1
+; CHECK-NEXT: // kill: def $z0 killed $z0 killed $z0_z1 def $z0_z1
+; CHECK-NEXT: bfmop4s za0.h, { z0.h, z1.h }, z24.h
+; CHECK-NEXT: ret
+ call void @llvm.aarch64.sme.mop4s.2x1.nxv8bf16(i32 0, <vscale x 8 x bfloat> %zn1, <vscale x 8 x bfloat> %zn2, <vscale x 8 x bfloat> %zm)
+ ret void
+}
+
+attributes #0 = { nounwind "target-features"="+sme-i16i64,+sme-f64f64,+sme-b16b16,+sme2p1,+bf16,+sme-f16f16,+sme-mop4" }
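
A rough C-level sketch of how these new 2x1 forms might be exercised from
user code, assuming the ACLE intrinsic naming mirrors the 1x1 multiclass
pattern above (svmop4<a|s>_2x1_<za>[_<t>_<t>]); the exact intrinsic name
and the svint8x2_t prototype below are assumptions for illustration, not
taken from this patch:

#include <arm_sme.h>

// Accumulate the quarter-tile outer product of { zn[0], zn[1] } and zm
// into 32-bit ZA tile 0; this corresponds to the
// "smop4a za0.s, { z0.b, z1.b }, z24.b" form checked in the test above.
void mop4a_s8_example(svint8x2_t zn, svint8_t zm)
    __arm_streaming __arm_inout("za") {
  svmop4a_2x1_za32_s8_s8(0, zn, zm);
}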