[clang] [llvm] [AArch64] Add intrinsics for 16-bit non-widening FMLA/FMLS (PR #88553)
Momchil Velikov via cfe-commits
cfe-commits at lists.llvm.org
Thu May 9 08:48:19 PDT 2024
https://github.com/momchil-velikov updated https://github.com/llvm/llvm-project/pull/88553
>From 2f96bd7ec0186ce9f1f2f5b1d00bd63bb677a3d3 Mon Sep 17 00:00:00 2001
From: Momchil Velikov <momchil.velikov at arm.com>
Date: Fri, 12 Apr 2024 19:35:47 +0100
Subject: [PATCH 1/3] [AArch64] Add intrinsics for 16-bit non-widening
FMLA/FMLS
---
clang/include/clang/Basic/arm_sme.td | 34 +
.../acle_sme2_fmlas16.c | 592 ++++++++++++++++++
.../acle_sme2_fmlas16.c | 90 +++
.../lib/Target/AArch64/AArch64SMEInstrInfo.td | 59 +-
llvm/lib/Target/AArch64/SMEInstrFormats.td | 54 +-
.../AArch64/sme2-intrinsics-fmlas16.ll | 462 ++++++++++++++
6 files changed, 1254 insertions(+), 37 deletions(-)
create mode 100644 clang/test/CodeGen/aarch64-sme2-intrinsics/acle_sme2_fmlas16.c
create mode 100644 clang/test/Sema/aarch64-sme2-intrinsics/acle_sme2_fmlas16.c
create mode 100644 llvm/test/CodeGen/AArch64/sme2-intrinsics-fmlas16.ll
diff --git a/clang/include/clang/Basic/arm_sme.td b/clang/include/clang/Basic/arm_sme.td
index 1ac6d5170ea28..29bc627e7481a 100644
--- a/clang/include/clang/Basic/arm_sme.td
+++ b/clang/include/clang/Basic/arm_sme.td
@@ -458,6 +458,40 @@ let TargetGuard = "sme2,sme-f64f64" in {
def SVMLS_LANE_VG1x4_F64 : Inst<"svmls_lane_za64[_{d}]_vg1x4", "vm4di", "d", MergeNone, "aarch64_sme_fmls_lane_vg1x4", [IsStreaming, IsInOutZA], [ImmCheck<3, ImmCheck0_1>]>;
}
+let TargetGuard = "sme2p1,sme-f16f16" in {
+ def SVMLA_MULTI_VG1x2_F16 : Inst<"svmla_za16[_f16]_vg1x2", "vm22", "h", MergeNone, "aarch64_sme_fmla_vg1x2", [IsStreaming, IsInOutZA], []>;
+ def SVMLA_MULTI_VG1x4_F16 : Inst<"svmla_za16[_f16]_vg1x4", "vm44", "h", MergeNone, "aarch64_sme_fmla_vg1x4", [IsStreaming, IsInOutZA], []>;
+ def SVMLS_MULTI_VG1x2_F16 : Inst<"svmls_za16[_f16]_vg1x2", "vm22", "h", MergeNone, "aarch64_sme_fmls_vg1x2", [IsStreaming, IsInOutZA], []>;
+ def SVMLS_MULTI_VG1x4_F16 : Inst<"svmls_za16[_f16]_vg1x4", "vm44", "h", MergeNone, "aarch64_sme_fmls_vg1x4", [IsStreaming, IsInOutZA], []>;
+
+ def SVMLA_SINGLE_VG1x2_F16 : Inst<"svmla[_single]_za16[_f16]_vg1x2", "vm2d", "h", MergeNone, "aarch64_sme_fmla_single_vg1x2", [IsStreaming, IsInOutZA], []>;
+ def SVMLA_SINGLE_VG1x4_F16 : Inst<"svmla[_single]_za16[_f16]_vg1x4", "vm4d", "h", MergeNone, "aarch64_sme_fmla_single_vg1x4", [IsStreaming, IsInOutZA], []>;
+ def SVMLS_SINGLE_VG1x2_F16 : Inst<"svmls[_single]_za16[_f16]_vg1x2", "vm2d", "h", MergeNone, "aarch64_sme_fmls_single_vg1x2", [IsStreaming, IsInOutZA], []>;
+ def SVMLS_SINGLE_VG1x4_F16 : Inst<"svmls[_single]_za16[_f16]_vg1x4", "vm4d", "h", MergeNone, "aarch64_sme_fmls_single_vg1x4", [IsStreaming, IsInOutZA], []>;
+
+ def SVMLA_LANE_VG1x2_F16 : Inst<"svmla_lane_za16[_f16]_vg1x2", "vm2di", "h", MergeNone, "aarch64_sme_fmla_lane_vg1x2", [IsStreaming, IsInOutZA], [ImmCheck<3, ImmCheck0_7>]>;
+ def SVMLA_LANE_VG1x4_F16 : Inst<"svmla_lane_za16[_f16]_vg1x4", "vm4di", "h", MergeNone, "aarch64_sme_fmla_lane_vg1x4", [IsStreaming, IsInOutZA], [ImmCheck<3, ImmCheck0_7>]>;
+ def SVMLS_LANE_VG1x2_F16 : Inst<"svmls_lane_za16[_f16]_vg1x2", "vm2di", "h", MergeNone, "aarch64_sme_fmls_lane_vg1x2", [IsStreaming, IsInOutZA], [ImmCheck<3, ImmCheck0_7>]>;
+ def SVMLS_LANE_VG1x4_F16 : Inst<"svmls_lane_za16[_f16]_vg1x4", "vm4di", "h", MergeNone, "aarch64_sme_fmls_lane_vg1x4", [IsStreaming, IsInOutZA], [ImmCheck<3, ImmCheck0_7>]>;
+}
+
+let TargetGuard = "sme2,b16b16" in {
+ def SVMLA_MULTI_VG1x2_BF16 : Inst<"svmla_za16[_bf16]_vg1x2", "vm22", "b", MergeNone, "aarch64_sme_fmla_vg1x2", [IsStreaming, IsInOutZA], []>;
+ def SVMLA_MULTI_VG1x4_BF16 : Inst<"svmla_za16[_bf16]_vg1x4", "vm44", "b", MergeNone, "aarch64_sme_fmla_vg1x4", [IsStreaming, IsInOutZA], []>;
+ def SVMLS_MULTI_VG1x2_BF16 : Inst<"svmls_za16[_bf16]_vg1x2", "vm22", "b", MergeNone, "aarch64_sme_fmls_vg1x2", [IsStreaming, IsInOutZA], []>;
+ def SVMLS_MULTI_VG1x4_BF16 : Inst<"svmls_za16[_bf16]_vg1x4", "vm44", "b", MergeNone, "aarch64_sme_fmls_vg1x4", [IsStreaming, IsInOutZA], []>;
+
+ def SVMLA_SINGLE_VG1x2_BF16 : Inst<"svmla[_single]_za16[_bf16]_vg1x2", "vm2d", "b", MergeNone, "aarch64_sme_fmla_single_vg1x2", [IsStreaming, IsInOutZA], []>;
+ def SVMLA_SINGLE_VG1x4_BF16 : Inst<"svmla[_single]_za16[_bf16]_vg1x4", "vm4d", "b", MergeNone, "aarch64_sme_fmla_single_vg1x4", [IsStreaming, IsInOutZA], []>;
+ def SVMLS_SINGLE_VG1x2_BF16 : Inst<"svmls[_single]_za16[_bf16]_vg1x2", "vm2d", "b", MergeNone, "aarch64_sme_fmls_single_vg1x2", [IsStreaming, IsInOutZA], []>;
+ def SVMLS_SINGLE_VG1x4_BF16 : Inst<"svmls[_single]_za16[_bf16]_vg1x4", "vm4d", "b", MergeNone, "aarch64_sme_fmls_single_vg1x4", [IsStreaming, IsInOutZA], []>;
+
+ def SVMLA_LANE_VG1x2_BF16 : Inst<"svmla_lane_za16[_bf16]_vg1x2", "vm2di", "b", MergeNone, "aarch64_sme_fmla_lane_vg1x2", [IsStreaming, IsInOutZA], [ImmCheck<3, ImmCheck0_7>]>;
+ def SVMLA_LANE_VG1x4_BF16 : Inst<"svmla_lane_za16[_bf16]_vg1x4", "vm4di", "b", MergeNone, "aarch64_sme_fmla_lane_vg1x4", [IsStreaming, IsInOutZA], [ImmCheck<3, ImmCheck0_7>]>;
+ def SVMLS_LANE_VG1x2_BF16 : Inst<"svmls_lane_za16[_bf16]_vg1x2", "vm2di", "b", MergeNone, "aarch64_sme_fmls_lane_vg1x2", [IsStreaming, IsInOutZA], [ImmCheck<3, ImmCheck0_7>]>;
+ def SVMLS_LANE_VG1x4_BF16 : Inst<"svmls_lane_za16[_bf16]_vg1x4", "vm4di", "b", MergeNone, "aarch64_sme_fmls_lane_vg1x4", [IsStreaming, IsInOutZA], [ImmCheck<3, ImmCheck0_7>]>;
+}
+
// FMLAL/FMLSL/UMLAL/SMLAL
// SMLALL/UMLALL/USMLALL/SUMLALL
let TargetGuard = "sme2" in {
diff --git a/clang/test/CodeGen/aarch64-sme2-intrinsics/acle_sme2_fmlas16.c b/clang/test/CodeGen/aarch64-sme2-intrinsics/acle_sme2_fmlas16.c
new file mode 100644
index 0000000000000..81ab3e62276b2
--- /dev/null
+++ b/clang/test/CodeGen/aarch64-sme2-intrinsics/acle_sme2_fmlas16.c
@@ -0,0 +1,592 @@
+// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 4
+// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64-none-linux-gnu -target-feature +sme2p1 -target-feature +sme-f16f16 -target-feature +b16b16 -O2 -S -Werror -Wall -emit-llvm -o - %s | FileCheck %s
+// RUN: %clang_cc1 -x c++ -fclang-abi-compat=latest -triple aarch64-none-linux-gnu -target-feature +sme2p1 -target-feature +sme-f16f16 -target-feature +b16b16 -O2 -S -Werror -Wall -emit-llvm -o - %s | FileCheck %s --check-prefix CHECK-CXX
+// RUN: %clang_cc1 -DSME_OVERLOADED_FORMS -fclang-abi-compat=latest -triple aarch64-none-linux-gnu -target-feature +sme2p1 -target-feature +sme-f16f16 -target-feature +b16b16 -O2 -S -Werror -Wall -emit-llvm -o - %s | FileCheck %s
+// RUN: %clang_cc1 -DSME_OVERLOADED_FORMS -x c++ -fclang-abi-compat=latest -triple aarch64-none-linux-gnu -target-feature +sme2p1 -target-feature +sme-f16f16 -target-feature +b16b16 -O2 -S -Werror -Wall -emit-llvm -o - %s | FileCheck %s --check-prefix CHECK-CXX
+
+// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64-none-linux-gnu -target-feature +sme2p1 -target-feature +sme-f16f16 -target-feature +b16b16 -O2 -S -Werror -Wall %s -o /dev/null
+
+// REQUIRES: aarch64-registered-target
+#include <arm_sme.h>
+
+#ifdef SME_OVERLOADED_FORMS
+#define SME_ACLE_FUNC(A1, A2_UNUSED, A3, A4_UNUSED, A5) A1##A3##A5
+#else
+#define SME_ACLE_FUNC(A1, A2, A3, A4, A5) A1##A2##A3##A4##A5
+#endif
+
+// CHECK-LABEL: define dso_local void @test_svmla_single_za16_f16_vg1x2(
+// CHECK-SAME: i32 noundef [[SLICE:%.*]], <vscale x 16 x half> [[ZN:%.*]], <vscale x 8 x half> [[ZM:%.*]]) local_unnamed_addr #[[ATTR0:[0-9]+]] {
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 8 x half> @llvm.vector.extract.nxv8f16.nxv16f16(<vscale x 16 x half> [[ZN]], i64 0)
+// CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 8 x half> @llvm.vector.extract.nxv8f16.nxv16f16(<vscale x 16 x half> [[ZN]], i64 8)
+// CHECK-NEXT: tail call void @llvm.aarch64.sme.fmla.single.vg1x2.nxv8f16(i32 [[SLICE]], <vscale x 8 x half> [[TMP0]], <vscale x 8 x half> [[TMP1]], <vscale x 8 x half> [[ZM]])
+// CHECK-NEXT: ret void
+//
+// CHECK-CXX-LABEL: define dso_local void @_Z32test_svmla_single_za16_f16_vg1x2j13svfloat16x2_tu13__SVFloat16_t(
+// CHECK-CXX-SAME: i32 noundef [[SLICE:%.*]], <vscale x 16 x half> [[ZN:%.*]], <vscale x 8 x half> [[ZM:%.*]]) local_unnamed_addr #[[ATTR0:[0-9]+]] {
+// CHECK-CXX-NEXT: entry:
+// CHECK-CXX-NEXT: [[TMP0:%.*]] = tail call <vscale x 8 x half> @llvm.vector.extract.nxv8f16.nxv16f16(<vscale x 16 x half> [[ZN]], i64 0)
+// CHECK-CXX-NEXT: [[TMP1:%.*]] = tail call <vscale x 8 x half> @llvm.vector.extract.nxv8f16.nxv16f16(<vscale x 16 x half> [[ZN]], i64 8)
+// CHECK-CXX-NEXT: tail call void @llvm.aarch64.sme.fmla.single.vg1x2.nxv8f16(i32 [[SLICE]], <vscale x 8 x half> [[TMP0]], <vscale x 8 x half> [[TMP1]], <vscale x 8 x half> [[ZM]])
+// CHECK-CXX-NEXT: ret void
+//
+void test_svmla_single_za16_f16_vg1x2(uint32_t slice, svfloat16x2_t zn, svfloat16_t zm) __arm_streaming __arm_inout("za") {
+ SME_ACLE_FUNC(svmla,_single,_za16,_f16,_vg1x2)(slice, zn, zm);
+}
+
+// CHECK-LABEL: define dso_local void @test_svmla_single_za16_f16_vg1x4(
+// CHECK-SAME: i32 noundef [[SLICE:%.*]], <vscale x 32 x half> [[ZN:%.*]], <vscale x 8 x half> [[ZM:%.*]]) local_unnamed_addr #[[ATTR0]] {
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 8 x half> @llvm.vector.extract.nxv8f16.nxv32f16(<vscale x 32 x half> [[ZN]], i64 0)
+// CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 8 x half> @llvm.vector.extract.nxv8f16.nxv32f16(<vscale x 32 x half> [[ZN]], i64 8)
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <vscale x 8 x half> @llvm.vector.extract.nxv8f16.nxv32f16(<vscale x 32 x half> [[ZN]], i64 16)
+// CHECK-NEXT: [[TMP3:%.*]] = tail call <vscale x 8 x half> @llvm.vector.extract.nxv8f16.nxv32f16(<vscale x 32 x half> [[ZN]], i64 24)
+// CHECK-NEXT: tail call void @llvm.aarch64.sme.fmla.single.vg1x4.nxv8f16(i32 [[SLICE]], <vscale x 8 x half> [[TMP0]], <vscale x 8 x half> [[TMP1]], <vscale x 8 x half> [[TMP2]], <vscale x 8 x half> [[TMP3]], <vscale x 8 x half> [[ZM]])
+// CHECK-NEXT: ret void
+//
+// CHECK-CXX-LABEL: define dso_local void @_Z32test_svmla_single_za16_f16_vg1x4j13svfloat16x4_tu13__SVFloat16_t(
+// CHECK-CXX-SAME: i32 noundef [[SLICE:%.*]], <vscale x 32 x half> [[ZN:%.*]], <vscale x 8 x half> [[ZM:%.*]]) local_unnamed_addr #[[ATTR0]] {
+// CHECK-CXX-NEXT: entry:
+// CHECK-CXX-NEXT: [[TMP0:%.*]] = tail call <vscale x 8 x half> @llvm.vector.extract.nxv8f16.nxv32f16(<vscale x 32 x half> [[ZN]], i64 0)
+// CHECK-CXX-NEXT: [[TMP1:%.*]] = tail call <vscale x 8 x half> @llvm.vector.extract.nxv8f16.nxv32f16(<vscale x 32 x half> [[ZN]], i64 8)
+// CHECK-CXX-NEXT: [[TMP2:%.*]] = tail call <vscale x 8 x half> @llvm.vector.extract.nxv8f16.nxv32f16(<vscale x 32 x half> [[ZN]], i64 16)
+// CHECK-CXX-NEXT: [[TMP3:%.*]] = tail call <vscale x 8 x half> @llvm.vector.extract.nxv8f16.nxv32f16(<vscale x 32 x half> [[ZN]], i64 24)
+// CHECK-CXX-NEXT: tail call void @llvm.aarch64.sme.fmla.single.vg1x4.nxv8f16(i32 [[SLICE]], <vscale x 8 x half> [[TMP0]], <vscale x 8 x half> [[TMP1]], <vscale x 8 x half> [[TMP2]], <vscale x 8 x half> [[TMP3]], <vscale x 8 x half> [[ZM]])
+// CHECK-CXX-NEXT: ret void
+//
+void test_svmla_single_za16_f16_vg1x4(uint32_t slice, svfloat16x4_t zn, svfloat16_t zm) __arm_streaming __arm_inout("za") {
+ SME_ACLE_FUNC(svmla,_single,_za16,_f16,_vg1x4)(slice, zn, zm);
+}
+
+// CHECK-LABEL: define dso_local void @test_svmls_single_za16_f16_vg1x2(
+// CHECK-SAME: i32 noundef [[SLICE:%.*]], <vscale x 16 x half> [[ZN:%.*]], <vscale x 8 x half> [[ZM:%.*]]) local_unnamed_addr #[[ATTR0]] {
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 8 x half> @llvm.vector.extract.nxv8f16.nxv16f16(<vscale x 16 x half> [[ZN]], i64 0)
+// CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 8 x half> @llvm.vector.extract.nxv8f16.nxv16f16(<vscale x 16 x half> [[ZN]], i64 8)
+// CHECK-NEXT: tail call void @llvm.aarch64.sme.fmls.single.vg1x2.nxv8f16(i32 [[SLICE]], <vscale x 8 x half> [[TMP0]], <vscale x 8 x half> [[TMP1]], <vscale x 8 x half> [[ZM]])
+// CHECK-NEXT: ret void
+//
+// CHECK-CXX-LABEL: define dso_local void @_Z32test_svmls_single_za16_f16_vg1x2j13svfloat16x2_tu13__SVFloat16_t(
+// CHECK-CXX-SAME: i32 noundef [[SLICE:%.*]], <vscale x 16 x half> [[ZN:%.*]], <vscale x 8 x half> [[ZM:%.*]]) local_unnamed_addr #[[ATTR0]] {
+// CHECK-CXX-NEXT: entry:
+// CHECK-CXX-NEXT: [[TMP0:%.*]] = tail call <vscale x 8 x half> @llvm.vector.extract.nxv8f16.nxv16f16(<vscale x 16 x half> [[ZN]], i64 0)
+// CHECK-CXX-NEXT: [[TMP1:%.*]] = tail call <vscale x 8 x half> @llvm.vector.extract.nxv8f16.nxv16f16(<vscale x 16 x half> [[ZN]], i64 8)
+// CHECK-CXX-NEXT: tail call void @llvm.aarch64.sme.fmls.single.vg1x2.nxv8f16(i32 [[SLICE]], <vscale x 8 x half> [[TMP0]], <vscale x 8 x half> [[TMP1]], <vscale x 8 x half> [[ZM]])
+// CHECK-CXX-NEXT: ret void
+//
+void test_svmls_single_za16_f16_vg1x2(uint32_t slice, svfloat16x2_t zn, svfloat16_t zm) __arm_streaming __arm_inout("za") {
+ SME_ACLE_FUNC(svmls,_single,_za16,_f16,_vg1x2)(slice, zn, zm);
+}
+
+// CHECK-LABEL: define dso_local void @test_svmls_single_za16_f16_vg1x4(
+// CHECK-SAME: i32 noundef [[SLICE:%.*]], <vscale x 32 x half> [[ZN:%.*]], <vscale x 8 x half> [[ZM:%.*]]) local_unnamed_addr #[[ATTR0]] {
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 8 x half> @llvm.vector.extract.nxv8f16.nxv32f16(<vscale x 32 x half> [[ZN]], i64 0)
+// CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 8 x half> @llvm.vector.extract.nxv8f16.nxv32f16(<vscale x 32 x half> [[ZN]], i64 8)
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <vscale x 8 x half> @llvm.vector.extract.nxv8f16.nxv32f16(<vscale x 32 x half> [[ZN]], i64 16)
+// CHECK-NEXT: [[TMP3:%.*]] = tail call <vscale x 8 x half> @llvm.vector.extract.nxv8f16.nxv32f16(<vscale x 32 x half> [[ZN]], i64 24)
+// CHECK-NEXT: tail call void @llvm.aarch64.sme.fmls.single.vg1x4.nxv8f16(i32 [[SLICE]], <vscale x 8 x half> [[TMP0]], <vscale x 8 x half> [[TMP1]], <vscale x 8 x half> [[TMP2]], <vscale x 8 x half> [[TMP3]], <vscale x 8 x half> [[ZM]])
+// CHECK-NEXT: ret void
+//
+// CHECK-CXX-LABEL: define dso_local void @_Z32test_svmls_single_za16_f16_vg1x4j13svfloat16x4_tu13__SVFloat16_t(
+// CHECK-CXX-SAME: i32 noundef [[SLICE:%.*]], <vscale x 32 x half> [[ZN:%.*]], <vscale x 8 x half> [[ZM:%.*]]) local_unnamed_addr #[[ATTR0]] {
+// CHECK-CXX-NEXT: entry:
+// CHECK-CXX-NEXT: [[TMP0:%.*]] = tail call <vscale x 8 x half> @llvm.vector.extract.nxv8f16.nxv32f16(<vscale x 32 x half> [[ZN]], i64 0)
+// CHECK-CXX-NEXT: [[TMP1:%.*]] = tail call <vscale x 8 x half> @llvm.vector.extract.nxv8f16.nxv32f16(<vscale x 32 x half> [[ZN]], i64 8)
+// CHECK-CXX-NEXT: [[TMP2:%.*]] = tail call <vscale x 8 x half> @llvm.vector.extract.nxv8f16.nxv32f16(<vscale x 32 x half> [[ZN]], i64 16)
+// CHECK-CXX-NEXT: [[TMP3:%.*]] = tail call <vscale x 8 x half> @llvm.vector.extract.nxv8f16.nxv32f16(<vscale x 32 x half> [[ZN]], i64 24)
+// CHECK-CXX-NEXT: tail call void @llvm.aarch64.sme.fmls.single.vg1x4.nxv8f16(i32 [[SLICE]], <vscale x 8 x half> [[TMP0]], <vscale x 8 x half> [[TMP1]], <vscale x 8 x half> [[TMP2]], <vscale x 8 x half> [[TMP3]], <vscale x 8 x half> [[ZM]])
+// CHECK-CXX-NEXT: ret void
+//
+void test_svmls_single_za16_f16_vg1x4(uint32_t slice, svfloat16x4_t zn, svfloat16_t zm) __arm_streaming __arm_inout("za") {
+ SME_ACLE_FUNC(svmls,_single,_za16,_f16,_vg1x4)(slice, zn, zm);
+}
+
+// CHECK-LABEL: define dso_local void @test_svmla_za16_f16_vg1x2(
+// CHECK-SAME: i32 noundef [[SLICE:%.*]], <vscale x 16 x half> [[ZN:%.*]], <vscale x 16 x half> [[ZM:%.*]]) local_unnamed_addr #[[ATTR0]] {
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 8 x half> @llvm.vector.extract.nxv8f16.nxv16f16(<vscale x 16 x half> [[ZN]], i64 0)
+// CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 8 x half> @llvm.vector.extract.nxv8f16.nxv16f16(<vscale x 16 x half> [[ZN]], i64 8)
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <vscale x 8 x half> @llvm.vector.extract.nxv8f16.nxv16f16(<vscale x 16 x half> [[ZM]], i64 0)
+// CHECK-NEXT: [[TMP3:%.*]] = tail call <vscale x 8 x half> @llvm.vector.extract.nxv8f16.nxv16f16(<vscale x 16 x half> [[ZM]], i64 8)
+// CHECK-NEXT: tail call void @llvm.aarch64.sme.fmla.vg1x2.nxv8f16(i32 [[SLICE]], <vscale x 8 x half> [[TMP0]], <vscale x 8 x half> [[TMP1]], <vscale x 8 x half> [[TMP2]], <vscale x 8 x half> [[TMP3]])
+// CHECK-NEXT: ret void
+//
+// CHECK-CXX-LABEL: define dso_local void @_Z25test_svmla_za16_f16_vg1x2j13svfloat16x2_tS_(
+// CHECK-CXX-SAME: i32 noundef [[SLICE:%.*]], <vscale x 16 x half> [[ZN:%.*]], <vscale x 16 x half> [[ZM:%.*]]) local_unnamed_addr #[[ATTR0]] {
+// CHECK-CXX-NEXT: entry:
+// CHECK-CXX-NEXT: [[TMP0:%.*]] = tail call <vscale x 8 x half> @llvm.vector.extract.nxv8f16.nxv16f16(<vscale x 16 x half> [[ZN]], i64 0)
+// CHECK-CXX-NEXT: [[TMP1:%.*]] = tail call <vscale x 8 x half> @llvm.vector.extract.nxv8f16.nxv16f16(<vscale x 16 x half> [[ZN]], i64 8)
+// CHECK-CXX-NEXT: [[TMP2:%.*]] = tail call <vscale x 8 x half> @llvm.vector.extract.nxv8f16.nxv16f16(<vscale x 16 x half> [[ZM]], i64 0)
+// CHECK-CXX-NEXT: [[TMP3:%.*]] = tail call <vscale x 8 x half> @llvm.vector.extract.nxv8f16.nxv16f16(<vscale x 16 x half> [[ZM]], i64 8)
+// CHECK-CXX-NEXT: tail call void @llvm.aarch64.sme.fmla.vg1x2.nxv8f16(i32 [[SLICE]], <vscale x 8 x half> [[TMP0]], <vscale x 8 x half> [[TMP1]], <vscale x 8 x half> [[TMP2]], <vscale x 8 x half> [[TMP3]])
+// CHECK-CXX-NEXT: ret void
+//
+void test_svmla_za16_f16_vg1x2(uint32_t slice, svfloat16x2_t zn, svfloat16x2_t zm) __arm_streaming __arm_inout("za") {
+ SME_ACLE_FUNC(svmla,,_za16,_f16,_vg1x2)(slice, zn, zm);
+}
+
+// CHECK-LABEL: define dso_local void @test_svmla_za16_f16_vg1x4(
+// CHECK-SAME: i32 noundef [[SLICE:%.*]], <vscale x 32 x half> [[ZN:%.*]], <vscale x 32 x half> [[ZM:%.*]]) local_unnamed_addr #[[ATTR0]] {
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 8 x half> @llvm.vector.extract.nxv8f16.nxv32f16(<vscale x 32 x half> [[ZN]], i64 0)
+// CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 8 x half> @llvm.vector.extract.nxv8f16.nxv32f16(<vscale x 32 x half> [[ZN]], i64 8)
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <vscale x 8 x half> @llvm.vector.extract.nxv8f16.nxv32f16(<vscale x 32 x half> [[ZN]], i64 16)
+// CHECK-NEXT: [[TMP3:%.*]] = tail call <vscale x 8 x half> @llvm.vector.extract.nxv8f16.nxv32f16(<vscale x 32 x half> [[ZN]], i64 24)
+// CHECK-NEXT: [[TMP4:%.*]] = tail call <vscale x 8 x half> @llvm.vector.extract.nxv8f16.nxv32f16(<vscale x 32 x half> [[ZM]], i64 0)
+// CHECK-NEXT: [[TMP5:%.*]] = tail call <vscale x 8 x half> @llvm.vector.extract.nxv8f16.nxv32f16(<vscale x 32 x half> [[ZM]], i64 8)
+// CHECK-NEXT: [[TMP6:%.*]] = tail call <vscale x 8 x half> @llvm.vector.extract.nxv8f16.nxv32f16(<vscale x 32 x half> [[ZM]], i64 16)
+// CHECK-NEXT: [[TMP7:%.*]] = tail call <vscale x 8 x half> @llvm.vector.extract.nxv8f16.nxv32f16(<vscale x 32 x half> [[ZM]], i64 24)
+// CHECK-NEXT: tail call void @llvm.aarch64.sme.fmla.vg1x4.nxv8f16(i32 [[SLICE]], <vscale x 8 x half> [[TMP0]], <vscale x 8 x half> [[TMP1]], <vscale x 8 x half> [[TMP2]], <vscale x 8 x half> [[TMP3]], <vscale x 8 x half> [[TMP4]], <vscale x 8 x half> [[TMP5]], <vscale x 8 x half> [[TMP6]], <vscale x 8 x half> [[TMP7]])
+// CHECK-NEXT: ret void
+//
+// CHECK-CXX-LABEL: define dso_local void @_Z25test_svmla_za16_f16_vg1x4j13svfloat16x4_tS_(
+// CHECK-CXX-SAME: i32 noundef [[SLICE:%.*]], <vscale x 32 x half> [[ZN:%.*]], <vscale x 32 x half> [[ZM:%.*]]) local_unnamed_addr #[[ATTR0]] {
+// CHECK-CXX-NEXT: entry:
+// CHECK-CXX-NEXT: [[TMP0:%.*]] = tail call <vscale x 8 x half> @llvm.vector.extract.nxv8f16.nxv32f16(<vscale x 32 x half> [[ZN]], i64 0)
+// CHECK-CXX-NEXT: [[TMP1:%.*]] = tail call <vscale x 8 x half> @llvm.vector.extract.nxv8f16.nxv32f16(<vscale x 32 x half> [[ZN]], i64 8)
+// CHECK-CXX-NEXT: [[TMP2:%.*]] = tail call <vscale x 8 x half> @llvm.vector.extract.nxv8f16.nxv32f16(<vscale x 32 x half> [[ZN]], i64 16)
+// CHECK-CXX-NEXT: [[TMP3:%.*]] = tail call <vscale x 8 x half> @llvm.vector.extract.nxv8f16.nxv32f16(<vscale x 32 x half> [[ZN]], i64 24)
+// CHECK-CXX-NEXT: [[TMP4:%.*]] = tail call <vscale x 8 x half> @llvm.vector.extract.nxv8f16.nxv32f16(<vscale x 32 x half> [[ZM]], i64 0)
+// CHECK-CXX-NEXT: [[TMP5:%.*]] = tail call <vscale x 8 x half> @llvm.vector.extract.nxv8f16.nxv32f16(<vscale x 32 x half> [[ZM]], i64 8)
+// CHECK-CXX-NEXT: [[TMP6:%.*]] = tail call <vscale x 8 x half> @llvm.vector.extract.nxv8f16.nxv32f16(<vscale x 32 x half> [[ZM]], i64 16)
+// CHECK-CXX-NEXT: [[TMP7:%.*]] = tail call <vscale x 8 x half> @llvm.vector.extract.nxv8f16.nxv32f16(<vscale x 32 x half> [[ZM]], i64 24)
+// CHECK-CXX-NEXT: tail call void @llvm.aarch64.sme.fmla.vg1x4.nxv8f16(i32 [[SLICE]], <vscale x 8 x half> [[TMP0]], <vscale x 8 x half> [[TMP1]], <vscale x 8 x half> [[TMP2]], <vscale x 8 x half> [[TMP3]], <vscale x 8 x half> [[TMP4]], <vscale x 8 x half> [[TMP5]], <vscale x 8 x half> [[TMP6]], <vscale x 8 x half> [[TMP7]])
+// CHECK-CXX-NEXT: ret void
+//
+void test_svmla_za16_f16_vg1x4(uint32_t slice, svfloat16x4_t zn, svfloat16x4_t zm) __arm_streaming __arm_inout("za") {
+ SME_ACLE_FUNC(svmla,,_za16,_f16,_vg1x4)(slice, zn, zm);
+}
+
+// CHECK-LABEL: define dso_local void @test_svmls_za16_f16_vg1x2(
+// CHECK-SAME: i32 noundef [[SLICE:%.*]], <vscale x 16 x half> [[ZN:%.*]], <vscale x 16 x half> [[ZM:%.*]]) local_unnamed_addr #[[ATTR0]] {
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 8 x half> @llvm.vector.extract.nxv8f16.nxv16f16(<vscale x 16 x half> [[ZN]], i64 0)
+// CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 8 x half> @llvm.vector.extract.nxv8f16.nxv16f16(<vscale x 16 x half> [[ZN]], i64 8)
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <vscale x 8 x half> @llvm.vector.extract.nxv8f16.nxv16f16(<vscale x 16 x half> [[ZM]], i64 0)
+// CHECK-NEXT: [[TMP3:%.*]] = tail call <vscale x 8 x half> @llvm.vector.extract.nxv8f16.nxv16f16(<vscale x 16 x half> [[ZM]], i64 8)
+// CHECK-NEXT: tail call void @llvm.aarch64.sme.fmls.vg1x2.nxv8f16(i32 [[SLICE]], <vscale x 8 x half> [[TMP0]], <vscale x 8 x half> [[TMP1]], <vscale x 8 x half> [[TMP2]], <vscale x 8 x half> [[TMP3]])
+// CHECK-NEXT: ret void
+//
+// CHECK-CXX-LABEL: define dso_local void @_Z25test_svmls_za16_f16_vg1x2j13svfloat16x2_tS_(
+// CHECK-CXX-SAME: i32 noundef [[SLICE:%.*]], <vscale x 16 x half> [[ZN:%.*]], <vscale x 16 x half> [[ZM:%.*]]) local_unnamed_addr #[[ATTR0]] {
+// CHECK-CXX-NEXT: entry:
+// CHECK-CXX-NEXT: [[TMP0:%.*]] = tail call <vscale x 8 x half> @llvm.vector.extract.nxv8f16.nxv16f16(<vscale x 16 x half> [[ZN]], i64 0)
+// CHECK-CXX-NEXT: [[TMP1:%.*]] = tail call <vscale x 8 x half> @llvm.vector.extract.nxv8f16.nxv16f16(<vscale x 16 x half> [[ZN]], i64 8)
+// CHECK-CXX-NEXT: [[TMP2:%.*]] = tail call <vscale x 8 x half> @llvm.vector.extract.nxv8f16.nxv16f16(<vscale x 16 x half> [[ZM]], i64 0)
+// CHECK-CXX-NEXT: [[TMP3:%.*]] = tail call <vscale x 8 x half> @llvm.vector.extract.nxv8f16.nxv16f16(<vscale x 16 x half> [[ZM]], i64 8)
+// CHECK-CXX-NEXT: tail call void @llvm.aarch64.sme.fmls.vg1x2.nxv8f16(i32 [[SLICE]], <vscale x 8 x half> [[TMP0]], <vscale x 8 x half> [[TMP1]], <vscale x 8 x half> [[TMP2]], <vscale x 8 x half> [[TMP3]])
+// CHECK-CXX-NEXT: ret void
+//
+void test_svmls_za16_f16_vg1x2(uint32_t slice, svfloat16x2_t zn, svfloat16x2_t zm) __arm_streaming __arm_inout("za") {
+ SME_ACLE_FUNC(svmls,,_za16,_f16,_vg1x2)(slice, zn, zm);
+}
+
+// CHECK-LABEL: define dso_local void @test_svmls_za16_f16_vg1x4(
+// CHECK-SAME: i32 noundef [[SLICE:%.*]], <vscale x 32 x half> [[ZN:%.*]], <vscale x 32 x half> [[ZM:%.*]]) local_unnamed_addr #[[ATTR0]] {
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 8 x half> @llvm.vector.extract.nxv8f16.nxv32f16(<vscale x 32 x half> [[ZN]], i64 0)
+// CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 8 x half> @llvm.vector.extract.nxv8f16.nxv32f16(<vscale x 32 x half> [[ZN]], i64 8)
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <vscale x 8 x half> @llvm.vector.extract.nxv8f16.nxv32f16(<vscale x 32 x half> [[ZN]], i64 16)
+// CHECK-NEXT: [[TMP3:%.*]] = tail call <vscale x 8 x half> @llvm.vector.extract.nxv8f16.nxv32f16(<vscale x 32 x half> [[ZN]], i64 24)
+// CHECK-NEXT: [[TMP4:%.*]] = tail call <vscale x 8 x half> @llvm.vector.extract.nxv8f16.nxv32f16(<vscale x 32 x half> [[ZM]], i64 0)
+// CHECK-NEXT: [[TMP5:%.*]] = tail call <vscale x 8 x half> @llvm.vector.extract.nxv8f16.nxv32f16(<vscale x 32 x half> [[ZM]], i64 8)
+// CHECK-NEXT: [[TMP6:%.*]] = tail call <vscale x 8 x half> @llvm.vector.extract.nxv8f16.nxv32f16(<vscale x 32 x half> [[ZM]], i64 16)
+// CHECK-NEXT: [[TMP7:%.*]] = tail call <vscale x 8 x half> @llvm.vector.extract.nxv8f16.nxv32f16(<vscale x 32 x half> [[ZM]], i64 24)
+// CHECK-NEXT: tail call void @llvm.aarch64.sme.fmls.vg1x4.nxv8f16(i32 [[SLICE]], <vscale x 8 x half> [[TMP0]], <vscale x 8 x half> [[TMP1]], <vscale x 8 x half> [[TMP2]], <vscale x 8 x half> [[TMP3]], <vscale x 8 x half> [[TMP4]], <vscale x 8 x half> [[TMP5]], <vscale x 8 x half> [[TMP6]], <vscale x 8 x half> [[TMP7]])
+// CHECK-NEXT: ret void
+//
+// CHECK-CXX-LABEL: define dso_local void @_Z25test_svmls_za16_f16_vg1x4j13svfloat16x4_tS_(
+// CHECK-CXX-SAME: i32 noundef [[SLICE:%.*]], <vscale x 32 x half> [[ZN:%.*]], <vscale x 32 x half> [[ZM:%.*]]) local_unnamed_addr #[[ATTR0]] {
+// CHECK-CXX-NEXT: entry:
+// CHECK-CXX-NEXT: [[TMP0:%.*]] = tail call <vscale x 8 x half> @llvm.vector.extract.nxv8f16.nxv32f16(<vscale x 32 x half> [[ZN]], i64 0)
+// CHECK-CXX-NEXT: [[TMP1:%.*]] = tail call <vscale x 8 x half> @llvm.vector.extract.nxv8f16.nxv32f16(<vscale x 32 x half> [[ZN]], i64 8)
+// CHECK-CXX-NEXT: [[TMP2:%.*]] = tail call <vscale x 8 x half> @llvm.vector.extract.nxv8f16.nxv32f16(<vscale x 32 x half> [[ZN]], i64 16)
+// CHECK-CXX-NEXT: [[TMP3:%.*]] = tail call <vscale x 8 x half> @llvm.vector.extract.nxv8f16.nxv32f16(<vscale x 32 x half> [[ZN]], i64 24)
+// CHECK-CXX-NEXT: [[TMP4:%.*]] = tail call <vscale x 8 x half> @llvm.vector.extract.nxv8f16.nxv32f16(<vscale x 32 x half> [[ZM]], i64 0)
+// CHECK-CXX-NEXT: [[TMP5:%.*]] = tail call <vscale x 8 x half> @llvm.vector.extract.nxv8f16.nxv32f16(<vscale x 32 x half> [[ZM]], i64 8)
+// CHECK-CXX-NEXT: [[TMP6:%.*]] = tail call <vscale x 8 x half> @llvm.vector.extract.nxv8f16.nxv32f16(<vscale x 32 x half> [[ZM]], i64 16)
+// CHECK-CXX-NEXT: [[TMP7:%.*]] = tail call <vscale x 8 x half> @llvm.vector.extract.nxv8f16.nxv32f16(<vscale x 32 x half> [[ZM]], i64 24)
+// CHECK-CXX-NEXT: tail call void @llvm.aarch64.sme.fmls.vg1x4.nxv8f16(i32 [[SLICE]], <vscale x 8 x half> [[TMP0]], <vscale x 8 x half> [[TMP1]], <vscale x 8 x half> [[TMP2]], <vscale x 8 x half> [[TMP3]], <vscale x 8 x half> [[TMP4]], <vscale x 8 x half> [[TMP5]], <vscale x 8 x half> [[TMP6]], <vscale x 8 x half> [[TMP7]])
+// CHECK-CXX-NEXT: ret void
+//
+void test_svmls_za16_f16_vg1x4(uint32_t slice, svfloat16x4_t zn, svfloat16x4_t zm) __arm_streaming __arm_inout("za") {
+ SME_ACLE_FUNC(svmls,,_za16,_f16,_vg1x4)(slice, zn, zm);
+}
+
+// CHECK-LABEL: define dso_local void @test_svmla_lane_za16_f16_vg1x2(
+// CHECK-SAME: i32 noundef [[SLICE:%.*]], <vscale x 16 x half> [[ZN:%.*]], <vscale x 8 x half> [[ZM:%.*]]) local_unnamed_addr #[[ATTR0]] {
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 8 x half> @llvm.vector.extract.nxv8f16.nxv16f16(<vscale x 16 x half> [[ZN]], i64 0)
+// CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 8 x half> @llvm.vector.extract.nxv8f16.nxv16f16(<vscale x 16 x half> [[ZN]], i64 8)
+// CHECK-NEXT: tail call void @llvm.aarch64.sme.fmla.lane.vg1x2.nxv8f16(i32 [[SLICE]], <vscale x 8 x half> [[TMP0]], <vscale x 8 x half> [[TMP1]], <vscale x 8 x half> [[ZM]], i32 7)
+// CHECK-NEXT: ret void
+//
+// CHECK-CXX-LABEL: define dso_local void @_Z30test_svmla_lane_za16_f16_vg1x2j13svfloat16x2_tu13__SVFloat16_t(
+// CHECK-CXX-SAME: i32 noundef [[SLICE:%.*]], <vscale x 16 x half> [[ZN:%.*]], <vscale x 8 x half> [[ZM:%.*]]) local_unnamed_addr #[[ATTR0]] {
+// CHECK-CXX-NEXT: entry:
+// CHECK-CXX-NEXT: [[TMP0:%.*]] = tail call <vscale x 8 x half> @llvm.vector.extract.nxv8f16.nxv16f16(<vscale x 16 x half> [[ZN]], i64 0)
+// CHECK-CXX-NEXT: [[TMP1:%.*]] = tail call <vscale x 8 x half> @llvm.vector.extract.nxv8f16.nxv16f16(<vscale x 16 x half> [[ZN]], i64 8)
+// CHECK-CXX-NEXT: tail call void @llvm.aarch64.sme.fmla.lane.vg1x2.nxv8f16(i32 [[SLICE]], <vscale x 8 x half> [[TMP0]], <vscale x 8 x half> [[TMP1]], <vscale x 8 x half> [[ZM]], i32 7)
+// CHECK-CXX-NEXT: ret void
+//
+void test_svmla_lane_za16_f16_vg1x2(uint32_t slice, svfloat16x2_t zn, svfloat16_t zm) __arm_streaming __arm_inout("za") {
+ SME_ACLE_FUNC(svmla_lane,,_za16,_f16,_vg1x2)(slice, zn, zm, 7);
+}
+
+// CHECK-LABEL: define dso_local void @test_svmla_lane_za16_f16_vg1x4(
+// CHECK-SAME: i32 noundef [[SLICE:%.*]], <vscale x 32 x half> [[ZN:%.*]], <vscale x 8 x half> [[ZM:%.*]]) local_unnamed_addr #[[ATTR0]] {
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 8 x half> @llvm.vector.extract.nxv8f16.nxv32f16(<vscale x 32 x half> [[ZN]], i64 0)
+// CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 8 x half> @llvm.vector.extract.nxv8f16.nxv32f16(<vscale x 32 x half> [[ZN]], i64 8)
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <vscale x 8 x half> @llvm.vector.extract.nxv8f16.nxv32f16(<vscale x 32 x half> [[ZN]], i64 16)
+// CHECK-NEXT: [[TMP3:%.*]] = tail call <vscale x 8 x half> @llvm.vector.extract.nxv8f16.nxv32f16(<vscale x 32 x half> [[ZN]], i64 24)
+// CHECK-NEXT: tail call void @llvm.aarch64.sme.fmla.lane.vg1x4.nxv8f16(i32 [[SLICE]], <vscale x 8 x half> [[TMP0]], <vscale x 8 x half> [[TMP1]], <vscale x 8 x half> [[TMP2]], <vscale x 8 x half> [[TMP3]], <vscale x 8 x half> [[ZM]], i32 7)
+// CHECK-NEXT: ret void
+//
+// CHECK-CXX-LABEL: define dso_local void @_Z30test_svmla_lane_za16_f16_vg1x4j13svfloat16x4_tu13__SVFloat16_t(
+// CHECK-CXX-SAME: i32 noundef [[SLICE:%.*]], <vscale x 32 x half> [[ZN:%.*]], <vscale x 8 x half> [[ZM:%.*]]) local_unnamed_addr #[[ATTR0]] {
+// CHECK-CXX-NEXT: entry:
+// CHECK-CXX-NEXT: [[TMP0:%.*]] = tail call <vscale x 8 x half> @llvm.vector.extract.nxv8f16.nxv32f16(<vscale x 32 x half> [[ZN]], i64 0)
+// CHECK-CXX-NEXT: [[TMP1:%.*]] = tail call <vscale x 8 x half> @llvm.vector.extract.nxv8f16.nxv32f16(<vscale x 32 x half> [[ZN]], i64 8)
+// CHECK-CXX-NEXT: [[TMP2:%.*]] = tail call <vscale x 8 x half> @llvm.vector.extract.nxv8f16.nxv32f16(<vscale x 32 x half> [[ZN]], i64 16)
+// CHECK-CXX-NEXT: [[TMP3:%.*]] = tail call <vscale x 8 x half> @llvm.vector.extract.nxv8f16.nxv32f16(<vscale x 32 x half> [[ZN]], i64 24)
+// CHECK-CXX-NEXT: tail call void @llvm.aarch64.sme.fmla.lane.vg1x4.nxv8f16(i32 [[SLICE]], <vscale x 8 x half> [[TMP0]], <vscale x 8 x half> [[TMP1]], <vscale x 8 x half> [[TMP2]], <vscale x 8 x half> [[TMP3]], <vscale x 8 x half> [[ZM]], i32 7)
+// CHECK-CXX-NEXT: ret void
+//
+void test_svmla_lane_za16_f16_vg1x4(uint32_t slice, svfloat16x4_t zn, svfloat16_t zm) __arm_streaming __arm_inout("za") {
+ SME_ACLE_FUNC(svmla_lane,,_za16,_f16,_vg1x4)(slice, zn, zm, 7);
+}
+
+// CHECK-LABEL: define dso_local void @test_svmls_lane_za16_f16_vg1x2(
+// CHECK-SAME: i32 noundef [[SLICE:%.*]], <vscale x 16 x half> [[ZN:%.*]], <vscale x 8 x half> [[ZM:%.*]]) local_unnamed_addr #[[ATTR0]] {
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 8 x half> @llvm.vector.extract.nxv8f16.nxv16f16(<vscale x 16 x half> [[ZN]], i64 0)
+// CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 8 x half> @llvm.vector.extract.nxv8f16.nxv16f16(<vscale x 16 x half> [[ZN]], i64 8)
+// CHECK-NEXT: tail call void @llvm.aarch64.sme.fmls.lane.vg1x2.nxv8f16(i32 [[SLICE]], <vscale x 8 x half> [[TMP0]], <vscale x 8 x half> [[TMP1]], <vscale x 8 x half> [[ZM]], i32 7)
+// CHECK-NEXT: ret void
+//
+// CHECK-CXX-LABEL: define dso_local void @_Z30test_svmls_lane_za16_f16_vg1x2j13svfloat16x2_tu13__SVFloat16_t(
+// CHECK-CXX-SAME: i32 noundef [[SLICE:%.*]], <vscale x 16 x half> [[ZN:%.*]], <vscale x 8 x half> [[ZM:%.*]]) local_unnamed_addr #[[ATTR0]] {
+// CHECK-CXX-NEXT: entry:
+// CHECK-CXX-NEXT: [[TMP0:%.*]] = tail call <vscale x 8 x half> @llvm.vector.extract.nxv8f16.nxv16f16(<vscale x 16 x half> [[ZN]], i64 0)
+// CHECK-CXX-NEXT: [[TMP1:%.*]] = tail call <vscale x 8 x half> @llvm.vector.extract.nxv8f16.nxv16f16(<vscale x 16 x half> [[ZN]], i64 8)
+// CHECK-CXX-NEXT: tail call void @llvm.aarch64.sme.fmls.lane.vg1x2.nxv8f16(i32 [[SLICE]], <vscale x 8 x half> [[TMP0]], <vscale x 8 x half> [[TMP1]], <vscale x 8 x half> [[ZM]], i32 7)
+// CHECK-CXX-NEXT: ret void
+//
+void test_svmls_lane_za16_f16_vg1x2(uint32_t slice, svfloat16x2_t zn, svfloat16_t zm) __arm_streaming __arm_inout("za") {
+ SME_ACLE_FUNC(svmls_lane,,_za16,_f16,_vg1x2)(slice, zn, zm, 7);
+}
+
+// CHECK-LABEL: define dso_local void @test_svmls_lane_za16_f16_vg1x4(
+// CHECK-SAME: i32 noundef [[SLICE:%.*]], <vscale x 32 x half> [[ZN:%.*]], <vscale x 8 x half> [[ZM:%.*]]) local_unnamed_addr #[[ATTR0]] {
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 8 x half> @llvm.vector.extract.nxv8f16.nxv32f16(<vscale x 32 x half> [[ZN]], i64 0)
+// CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 8 x half> @llvm.vector.extract.nxv8f16.nxv32f16(<vscale x 32 x half> [[ZN]], i64 8)
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <vscale x 8 x half> @llvm.vector.extract.nxv8f16.nxv32f16(<vscale x 32 x half> [[ZN]], i64 16)
+// CHECK-NEXT: [[TMP3:%.*]] = tail call <vscale x 8 x half> @llvm.vector.extract.nxv8f16.nxv32f16(<vscale x 32 x half> [[ZN]], i64 24)
+// CHECK-NEXT: tail call void @llvm.aarch64.sme.fmls.lane.vg1x4.nxv8f16(i32 [[SLICE]], <vscale x 8 x half> [[TMP0]], <vscale x 8 x half> [[TMP1]], <vscale x 8 x half> [[TMP2]], <vscale x 8 x half> [[TMP3]], <vscale x 8 x half> [[ZM]], i32 7)
+// CHECK-NEXT: ret void
+//
+// CHECK-CXX-LABEL: define dso_local void @_Z30test_svmls_lane_za16_f16_vg1x4j13svfloat16x4_tu13__SVFloat16_t(
+// CHECK-CXX-SAME: i32 noundef [[SLICE:%.*]], <vscale x 32 x half> [[ZN:%.*]], <vscale x 8 x half> [[ZM:%.*]]) local_unnamed_addr #[[ATTR0]] {
+// CHECK-CXX-NEXT: entry:
+// CHECK-CXX-NEXT: [[TMP0:%.*]] = tail call <vscale x 8 x half> @llvm.vector.extract.nxv8f16.nxv32f16(<vscale x 32 x half> [[ZN]], i64 0)
+// CHECK-CXX-NEXT: [[TMP1:%.*]] = tail call <vscale x 8 x half> @llvm.vector.extract.nxv8f16.nxv32f16(<vscale x 32 x half> [[ZN]], i64 8)
+// CHECK-CXX-NEXT: [[TMP2:%.*]] = tail call <vscale x 8 x half> @llvm.vector.extract.nxv8f16.nxv32f16(<vscale x 32 x half> [[ZN]], i64 16)
+// CHECK-CXX-NEXT: [[TMP3:%.*]] = tail call <vscale x 8 x half> @llvm.vector.extract.nxv8f16.nxv32f16(<vscale x 32 x half> [[ZN]], i64 24)
+// CHECK-CXX-NEXT: tail call void @llvm.aarch64.sme.fmls.lane.vg1x4.nxv8f16(i32 [[SLICE]], <vscale x 8 x half> [[TMP0]], <vscale x 8 x half> [[TMP1]], <vscale x 8 x half> [[TMP2]], <vscale x 8 x half> [[TMP3]], <vscale x 8 x half> [[ZM]], i32 7)
+// CHECK-CXX-NEXT: ret void
+//
+void test_svmls_lane_za16_f16_vg1x4(uint32_t slice, svfloat16x4_t zn, svfloat16_t zm) __arm_streaming __arm_inout("za") {
+ SME_ACLE_FUNC(svmls_lane,,_za16,_f16,_vg1x4)(slice, zn, zm, 7);
+}
+
+// CHECK-LABEL: define dso_local void @test_svmla_single_za16_bf16_vg1x2(
+// CHECK-SAME: i32 noundef [[SLICE:%.*]], <vscale x 16 x bfloat> [[ZN:%.*]], <vscale x 8 x bfloat> [[ZM:%.*]]) local_unnamed_addr #[[ATTR0]] {
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 8 x bfloat> @llvm.vector.extract.nxv8bf16.nxv16bf16(<vscale x 16 x bfloat> [[ZN]], i64 0)
+// CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 8 x bfloat> @llvm.vector.extract.nxv8bf16.nxv16bf16(<vscale x 16 x bfloat> [[ZN]], i64 8)
+// CHECK-NEXT: tail call void @llvm.aarch64.sme.fmla.single.vg1x2.nxv8bf16(i32 [[SLICE]], <vscale x 8 x bfloat> [[TMP0]], <vscale x 8 x bfloat> [[TMP1]], <vscale x 8 x bfloat> [[ZM]])
+// CHECK-NEXT: ret void
+//
+// CHECK-CXX-LABEL: define dso_local void @_Z33test_svmla_single_za16_bf16_vg1x2j14svbfloat16x2_tu14__SVBfloat16_t(
+// CHECK-CXX-SAME: i32 noundef [[SLICE:%.*]], <vscale x 16 x bfloat> [[ZN:%.*]], <vscale x 8 x bfloat> [[ZM:%.*]]) local_unnamed_addr #[[ATTR0]] {
+// CHECK-CXX-NEXT: entry:
+// CHECK-CXX-NEXT: [[TMP0:%.*]] = tail call <vscale x 8 x bfloat> @llvm.vector.extract.nxv8bf16.nxv16bf16(<vscale x 16 x bfloat> [[ZN]], i64 0)
+// CHECK-CXX-NEXT: [[TMP1:%.*]] = tail call <vscale x 8 x bfloat> @llvm.vector.extract.nxv8bf16.nxv16bf16(<vscale x 16 x bfloat> [[ZN]], i64 8)
+// CHECK-CXX-NEXT: tail call void @llvm.aarch64.sme.fmla.single.vg1x2.nxv8bf16(i32 [[SLICE]], <vscale x 8 x bfloat> [[TMP0]], <vscale x 8 x bfloat> [[TMP1]], <vscale x 8 x bfloat> [[ZM]])
+// CHECK-CXX-NEXT: ret void
+//
+void test_svmla_single_za16_bf16_vg1x2(uint32_t slice, svbfloat16x2_t zn, svbfloat16_t zm) __arm_streaming __arm_inout("za") {
+ SME_ACLE_FUNC(svmla, _single, _za16, _bf16, _vg1x2)(slice, zn, zm);
+}
+
+// CHECK-LABEL: define dso_local void @test_svmla_single_za16_bf16_vg1x4(
+// CHECK-SAME: i32 noundef [[SLICE:%.*]], <vscale x 32 x bfloat> [[ZN:%.*]], <vscale x 8 x bfloat> [[ZM:%.*]]) local_unnamed_addr #[[ATTR0]] {
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 8 x bfloat> @llvm.vector.extract.nxv8bf16.nxv32bf16(<vscale x 32 x bfloat> [[ZN]], i64 0)
+// CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 8 x bfloat> @llvm.vector.extract.nxv8bf16.nxv32bf16(<vscale x 32 x bfloat> [[ZN]], i64 8)
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <vscale x 8 x bfloat> @llvm.vector.extract.nxv8bf16.nxv32bf16(<vscale x 32 x bfloat> [[ZN]], i64 16)
+// CHECK-NEXT: [[TMP3:%.*]] = tail call <vscale x 8 x bfloat> @llvm.vector.extract.nxv8bf16.nxv32bf16(<vscale x 32 x bfloat> [[ZN]], i64 24)
+// CHECK-NEXT: tail call void @llvm.aarch64.sme.fmla.single.vg1x4.nxv8bf16(i32 [[SLICE]], <vscale x 8 x bfloat> [[TMP0]], <vscale x 8 x bfloat> [[TMP1]], <vscale x 8 x bfloat> [[TMP2]], <vscale x 8 x bfloat> [[TMP3]], <vscale x 8 x bfloat> [[ZM]])
+// CHECK-NEXT: ret void
+//
+// CHECK-CXX-LABEL: define dso_local void @_Z33test_svmla_single_za16_bf16_vg1x4j14svbfloat16x4_tu14__SVBfloat16_t(
+// CHECK-CXX-SAME: i32 noundef [[SLICE:%.*]], <vscale x 32 x bfloat> [[ZN:%.*]], <vscale x 8 x bfloat> [[ZM:%.*]]) local_unnamed_addr #[[ATTR0]] {
+// CHECK-CXX-NEXT: entry:
+// CHECK-CXX-NEXT: [[TMP0:%.*]] = tail call <vscale x 8 x bfloat> @llvm.vector.extract.nxv8bf16.nxv32bf16(<vscale x 32 x bfloat> [[ZN]], i64 0)
+// CHECK-CXX-NEXT: [[TMP1:%.*]] = tail call <vscale x 8 x bfloat> @llvm.vector.extract.nxv8bf16.nxv32bf16(<vscale x 32 x bfloat> [[ZN]], i64 8)
+// CHECK-CXX-NEXT: [[TMP2:%.*]] = tail call <vscale x 8 x bfloat> @llvm.vector.extract.nxv8bf16.nxv32bf16(<vscale x 32 x bfloat> [[ZN]], i64 16)
+// CHECK-CXX-NEXT: [[TMP3:%.*]] = tail call <vscale x 8 x bfloat> @llvm.vector.extract.nxv8bf16.nxv32bf16(<vscale x 32 x bfloat> [[ZN]], i64 24)
+// CHECK-CXX-NEXT: tail call void @llvm.aarch64.sme.fmla.single.vg1x4.nxv8bf16(i32 [[SLICE]], <vscale x 8 x bfloat> [[TMP0]], <vscale x 8 x bfloat> [[TMP1]], <vscale x 8 x bfloat> [[TMP2]], <vscale x 8 x bfloat> [[TMP3]], <vscale x 8 x bfloat> [[ZM]])
+// CHECK-CXX-NEXT: ret void
+//
+void test_svmla_single_za16_bf16_vg1x4(uint32_t slice, svbfloat16x4_t zn, svbfloat16_t zm) __arm_streaming __arm_inout("za") {
+ SME_ACLE_FUNC(svmla, _single, _za16, _bf16, _vg1x4)(slice, zn, zm);
+}
+
+// CHECK-LABEL: define dso_local void @test_svmls_single_za16_bf16_vg1x2(
+// CHECK-SAME: i32 noundef [[SLICE:%.*]], <vscale x 16 x bfloat> [[ZN:%.*]], <vscale x 8 x bfloat> [[ZM:%.*]]) local_unnamed_addr #[[ATTR0]] {
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 8 x bfloat> @llvm.vector.extract.nxv8bf16.nxv16bf16(<vscale x 16 x bfloat> [[ZN]], i64 0)
+// CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 8 x bfloat> @llvm.vector.extract.nxv8bf16.nxv16bf16(<vscale x 16 x bfloat> [[ZN]], i64 8)
+// CHECK-NEXT: tail call void @llvm.aarch64.sme.fmls.single.vg1x2.nxv8bf16(i32 [[SLICE]], <vscale x 8 x bfloat> [[TMP0]], <vscale x 8 x bfloat> [[TMP1]], <vscale x 8 x bfloat> [[ZM]])
+// CHECK-NEXT: ret void
+//
+// CHECK-CXX-LABEL: define dso_local void @_Z33test_svmls_single_za16_bf16_vg1x2j14svbfloat16x2_tu14__SVBfloat16_t(
+// CHECK-CXX-SAME: i32 noundef [[SLICE:%.*]], <vscale x 16 x bfloat> [[ZN:%.*]], <vscale x 8 x bfloat> [[ZM:%.*]]) local_unnamed_addr #[[ATTR0]] {
+// CHECK-CXX-NEXT: entry:
+// CHECK-CXX-NEXT: [[TMP0:%.*]] = tail call <vscale x 8 x bfloat> @llvm.vector.extract.nxv8bf16.nxv16bf16(<vscale x 16 x bfloat> [[ZN]], i64 0)
+// CHECK-CXX-NEXT: [[TMP1:%.*]] = tail call <vscale x 8 x bfloat> @llvm.vector.extract.nxv8bf16.nxv16bf16(<vscale x 16 x bfloat> [[ZN]], i64 8)
+// CHECK-CXX-NEXT: tail call void @llvm.aarch64.sme.fmls.single.vg1x2.nxv8bf16(i32 [[SLICE]], <vscale x 8 x bfloat> [[TMP0]], <vscale x 8 x bfloat> [[TMP1]], <vscale x 8 x bfloat> [[ZM]])
+// CHECK-CXX-NEXT: ret void
+//
+void test_svmls_single_za16_bf16_vg1x2(uint32_t slice, svbfloat16x2_t zn, svbfloat16_t zm) __arm_streaming __arm_inout("za") {
+ SME_ACLE_FUNC(svmls, _single, _za16, _bf16, _vg1x2)(slice, zn, zm);
+}
+
+// CHECK-LABEL: define dso_local void @test_svmls_single_za16_bf16_vg1x4(
+// CHECK-SAME: i32 noundef [[SLICE:%.*]], <vscale x 32 x bfloat> [[ZN:%.*]], <vscale x 8 x bfloat> [[ZM:%.*]]) local_unnamed_addr #[[ATTR0]] {
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 8 x bfloat> @llvm.vector.extract.nxv8bf16.nxv32bf16(<vscale x 32 x bfloat> [[ZN]], i64 0)
+// CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 8 x bfloat> @llvm.vector.extract.nxv8bf16.nxv32bf16(<vscale x 32 x bfloat> [[ZN]], i64 8)
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <vscale x 8 x bfloat> @llvm.vector.extract.nxv8bf16.nxv32bf16(<vscale x 32 x bfloat> [[ZN]], i64 16)
+// CHECK-NEXT: [[TMP3:%.*]] = tail call <vscale x 8 x bfloat> @llvm.vector.extract.nxv8bf16.nxv32bf16(<vscale x 32 x bfloat> [[ZN]], i64 24)
+// CHECK-NEXT: tail call void @llvm.aarch64.sme.fmls.single.vg1x4.nxv8bf16(i32 [[SLICE]], <vscale x 8 x bfloat> [[TMP0]], <vscale x 8 x bfloat> [[TMP1]], <vscale x 8 x bfloat> [[TMP2]], <vscale x 8 x bfloat> [[TMP3]], <vscale x 8 x bfloat> [[ZM]])
+// CHECK-NEXT: ret void
+//
+// CHECK-CXX-LABEL: define dso_local void @_Z33test_svmls_single_za16_bf16_vg1x4j14svbfloat16x4_tu14__SVBfloat16_t(
+// CHECK-CXX-SAME: i32 noundef [[SLICE:%.*]], <vscale x 32 x bfloat> [[ZN:%.*]], <vscale x 8 x bfloat> [[ZM:%.*]]) local_unnamed_addr #[[ATTR0]] {
+// CHECK-CXX-NEXT: entry:
+// CHECK-CXX-NEXT: [[TMP0:%.*]] = tail call <vscale x 8 x bfloat> @llvm.vector.extract.nxv8bf16.nxv32bf16(<vscale x 32 x bfloat> [[ZN]], i64 0)
+// CHECK-CXX-NEXT: [[TMP1:%.*]] = tail call <vscale x 8 x bfloat> @llvm.vector.extract.nxv8bf16.nxv32bf16(<vscale x 32 x bfloat> [[ZN]], i64 8)
+// CHECK-CXX-NEXT: [[TMP2:%.*]] = tail call <vscale x 8 x bfloat> @llvm.vector.extract.nxv8bf16.nxv32bf16(<vscale x 32 x bfloat> [[ZN]], i64 16)
+// CHECK-CXX-NEXT: [[TMP3:%.*]] = tail call <vscale x 8 x bfloat> @llvm.vector.extract.nxv8bf16.nxv32bf16(<vscale x 32 x bfloat> [[ZN]], i64 24)
+// CHECK-CXX-NEXT: tail call void @llvm.aarch64.sme.fmls.single.vg1x4.nxv8bf16(i32 [[SLICE]], <vscale x 8 x bfloat> [[TMP0]], <vscale x 8 x bfloat> [[TMP1]], <vscale x 8 x bfloat> [[TMP2]], <vscale x 8 x bfloat> [[TMP3]], <vscale x 8 x bfloat> [[ZM]])
+// CHECK-CXX-NEXT: ret void
+//
+void test_svmls_single_za16_bf16_vg1x4(uint32_t slice, svbfloat16x4_t zn, svbfloat16_t zm) __arm_streaming __arm_inout("za") {
+ SME_ACLE_FUNC(svmls, _single, _za16, _bf16, _vg1x4)(slice, zn, zm);
+}
+
+// CHECK-LABEL: define dso_local void @test_svmla_za16_bf16_vg1x2(
+// CHECK-SAME: i32 noundef [[SLICE:%.*]], <vscale x 16 x bfloat> [[ZN:%.*]], <vscale x 16 x bfloat> [[ZM:%.*]]) local_unnamed_addr #[[ATTR0]] {
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 8 x bfloat> @llvm.vector.extract.nxv8bf16.nxv16bf16(<vscale x 16 x bfloat> [[ZN]], i64 0)
+// CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 8 x bfloat> @llvm.vector.extract.nxv8bf16.nxv16bf16(<vscale x 16 x bfloat> [[ZN]], i64 8)
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <vscale x 8 x bfloat> @llvm.vector.extract.nxv8bf16.nxv16bf16(<vscale x 16 x bfloat> [[ZM]], i64 0)
+// CHECK-NEXT: [[TMP3:%.*]] = tail call <vscale x 8 x bfloat> @llvm.vector.extract.nxv8bf16.nxv16bf16(<vscale x 16 x bfloat> [[ZM]], i64 8)
+// CHECK-NEXT: tail call void @llvm.aarch64.sme.fmla.vg1x2.nxv8bf16(i32 [[SLICE]], <vscale x 8 x bfloat> [[TMP0]], <vscale x 8 x bfloat> [[TMP1]], <vscale x 8 x bfloat> [[TMP2]], <vscale x 8 x bfloat> [[TMP3]])
+// CHECK-NEXT: ret void
+//
+// CHECK-CXX-LABEL: define dso_local void @_Z26test_svmla_za16_bf16_vg1x2j14svbfloat16x2_tS_(
+// CHECK-CXX-SAME: i32 noundef [[SLICE:%.*]], <vscale x 16 x bfloat> [[ZN:%.*]], <vscale x 16 x bfloat> [[ZM:%.*]]) local_unnamed_addr #[[ATTR0]] {
+// CHECK-CXX-NEXT: entry:
+// CHECK-CXX-NEXT: [[TMP0:%.*]] = tail call <vscale x 8 x bfloat> @llvm.vector.extract.nxv8bf16.nxv16bf16(<vscale x 16 x bfloat> [[ZN]], i64 0)
+// CHECK-CXX-NEXT: [[TMP1:%.*]] = tail call <vscale x 8 x bfloat> @llvm.vector.extract.nxv8bf16.nxv16bf16(<vscale x 16 x bfloat> [[ZN]], i64 8)
+// CHECK-CXX-NEXT: [[TMP2:%.*]] = tail call <vscale x 8 x bfloat> @llvm.vector.extract.nxv8bf16.nxv16bf16(<vscale x 16 x bfloat> [[ZM]], i64 0)
+// CHECK-CXX-NEXT: [[TMP3:%.*]] = tail call <vscale x 8 x bfloat> @llvm.vector.extract.nxv8bf16.nxv16bf16(<vscale x 16 x bfloat> [[ZM]], i64 8)
+// CHECK-CXX-NEXT: tail call void @llvm.aarch64.sme.fmla.vg1x2.nxv8bf16(i32 [[SLICE]], <vscale x 8 x bfloat> [[TMP0]], <vscale x 8 x bfloat> [[TMP1]], <vscale x 8 x bfloat> [[TMP2]], <vscale x 8 x bfloat> [[TMP3]])
+// CHECK-CXX-NEXT: ret void
+//
+void test_svmla_za16_bf16_vg1x2(uint32_t slice, svbfloat16x2_t zn, svbfloat16x2_t zm) __arm_streaming __arm_inout("za") {
+ SME_ACLE_FUNC(svmla, , _za16, _bf16, _vg1x2)(slice, zn, zm);
+}
+
+// CHECK-LABEL: define dso_local void @test_svmla_za16_bf16_vg1x4(
+// CHECK-SAME: i32 noundef [[SLICE:%.*]], <vscale x 32 x bfloat> [[ZN:%.*]], <vscale x 32 x bfloat> [[ZM:%.*]]) local_unnamed_addr #[[ATTR0]] {
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 8 x bfloat> @llvm.vector.extract.nxv8bf16.nxv32bf16(<vscale x 32 x bfloat> [[ZN]], i64 0)
+// CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 8 x bfloat> @llvm.vector.extract.nxv8bf16.nxv32bf16(<vscale x 32 x bfloat> [[ZN]], i64 8)
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <vscale x 8 x bfloat> @llvm.vector.extract.nxv8bf16.nxv32bf16(<vscale x 32 x bfloat> [[ZN]], i64 16)
+// CHECK-NEXT: [[TMP3:%.*]] = tail call <vscale x 8 x bfloat> @llvm.vector.extract.nxv8bf16.nxv32bf16(<vscale x 32 x bfloat> [[ZN]], i64 24)
+// CHECK-NEXT: [[TMP4:%.*]] = tail call <vscale x 8 x bfloat> @llvm.vector.extract.nxv8bf16.nxv32bf16(<vscale x 32 x bfloat> [[ZM]], i64 0)
+// CHECK-NEXT: [[TMP5:%.*]] = tail call <vscale x 8 x bfloat> @llvm.vector.extract.nxv8bf16.nxv32bf16(<vscale x 32 x bfloat> [[ZM]], i64 8)
+// CHECK-NEXT: [[TMP6:%.*]] = tail call <vscale x 8 x bfloat> @llvm.vector.extract.nxv8bf16.nxv32bf16(<vscale x 32 x bfloat> [[ZM]], i64 16)
+// CHECK-NEXT: [[TMP7:%.*]] = tail call <vscale x 8 x bfloat> @llvm.vector.extract.nxv8bf16.nxv32bf16(<vscale x 32 x bfloat> [[ZM]], i64 24)
+// CHECK-NEXT: tail call void @llvm.aarch64.sme.fmla.vg1x4.nxv8bf16(i32 [[SLICE]], <vscale x 8 x bfloat> [[TMP0]], <vscale x 8 x bfloat> [[TMP1]], <vscale x 8 x bfloat> [[TMP2]], <vscale x 8 x bfloat> [[TMP3]], <vscale x 8 x bfloat> [[TMP4]], <vscale x 8 x bfloat> [[TMP5]], <vscale x 8 x bfloat> [[TMP6]], <vscale x 8 x bfloat> [[TMP7]])
+// CHECK-NEXT: ret void
+//
+// CHECK-CXX-LABEL: define dso_local void @_Z26test_svmla_za16_bf16_vg1x4j14svbfloat16x4_tS_(
+// CHECK-CXX-SAME: i32 noundef [[SLICE:%.*]], <vscale x 32 x bfloat> [[ZN:%.*]], <vscale x 32 x bfloat> [[ZM:%.*]]) local_unnamed_addr #[[ATTR0]] {
+// CHECK-CXX-NEXT: entry:
+// CHECK-CXX-NEXT: [[TMP0:%.*]] = tail call <vscale x 8 x bfloat> @llvm.vector.extract.nxv8bf16.nxv32bf16(<vscale x 32 x bfloat> [[ZN]], i64 0)
+// CHECK-CXX-NEXT: [[TMP1:%.*]] = tail call <vscale x 8 x bfloat> @llvm.vector.extract.nxv8bf16.nxv32bf16(<vscale x 32 x bfloat> [[ZN]], i64 8)
+// CHECK-CXX-NEXT: [[TMP2:%.*]] = tail call <vscale x 8 x bfloat> @llvm.vector.extract.nxv8bf16.nxv32bf16(<vscale x 32 x bfloat> [[ZN]], i64 16)
+// CHECK-CXX-NEXT: [[TMP3:%.*]] = tail call <vscale x 8 x bfloat> @llvm.vector.extract.nxv8bf16.nxv32bf16(<vscale x 32 x bfloat> [[ZN]], i64 24)
+// CHECK-CXX-NEXT: [[TMP4:%.*]] = tail call <vscale x 8 x bfloat> @llvm.vector.extract.nxv8bf16.nxv32bf16(<vscale x 32 x bfloat> [[ZM]], i64 0)
+// CHECK-CXX-NEXT: [[TMP5:%.*]] = tail call <vscale x 8 x bfloat> @llvm.vector.extract.nxv8bf16.nxv32bf16(<vscale x 32 x bfloat> [[ZM]], i64 8)
+// CHECK-CXX-NEXT: [[TMP6:%.*]] = tail call <vscale x 8 x bfloat> @llvm.vector.extract.nxv8bf16.nxv32bf16(<vscale x 32 x bfloat> [[ZM]], i64 16)
+// CHECK-CXX-NEXT: [[TMP7:%.*]] = tail call <vscale x 8 x bfloat> @llvm.vector.extract.nxv8bf16.nxv32bf16(<vscale x 32 x bfloat> [[ZM]], i64 24)
+// CHECK-CXX-NEXT: tail call void @llvm.aarch64.sme.fmla.vg1x4.nxv8bf16(i32 [[SLICE]], <vscale x 8 x bfloat> [[TMP0]], <vscale x 8 x bfloat> [[TMP1]], <vscale x 8 x bfloat> [[TMP2]], <vscale x 8 x bfloat> [[TMP3]], <vscale x 8 x bfloat> [[TMP4]], <vscale x 8 x bfloat> [[TMP5]], <vscale x 8 x bfloat> [[TMP6]], <vscale x 8 x bfloat> [[TMP7]])
+// CHECK-CXX-NEXT: ret void
+//
+void test_svmla_za16_bf16_vg1x4(uint32_t slice, svbfloat16x4_t zn, svbfloat16x4_t zm) __arm_streaming __arm_inout("za") {
+ SME_ACLE_FUNC(svmla, , _za16, _bf16, _vg1x4)(slice, zn, zm);
+}
+
+// CHECK-LABEL: define dso_local void @test_svmls_za16_bf16_vg1x2(
+// CHECK-SAME: i32 noundef [[SLICE:%.*]], <vscale x 16 x bfloat> [[ZN:%.*]], <vscale x 16 x bfloat> [[ZM:%.*]]) local_unnamed_addr #[[ATTR0]] {
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 8 x bfloat> @llvm.vector.extract.nxv8bf16.nxv16bf16(<vscale x 16 x bfloat> [[ZN]], i64 0)
+// CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 8 x bfloat> @llvm.vector.extract.nxv8bf16.nxv16bf16(<vscale x 16 x bfloat> [[ZN]], i64 8)
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <vscale x 8 x bfloat> @llvm.vector.extract.nxv8bf16.nxv16bf16(<vscale x 16 x bfloat> [[ZM]], i64 0)
+// CHECK-NEXT: [[TMP3:%.*]] = tail call <vscale x 8 x bfloat> @llvm.vector.extract.nxv8bf16.nxv16bf16(<vscale x 16 x bfloat> [[ZM]], i64 8)
+// CHECK-NEXT: tail call void @llvm.aarch64.sme.fmls.vg1x2.nxv8bf16(i32 [[SLICE]], <vscale x 8 x bfloat> [[TMP0]], <vscale x 8 x bfloat> [[TMP1]], <vscale x 8 x bfloat> [[TMP2]], <vscale x 8 x bfloat> [[TMP3]])
+// CHECK-NEXT: ret void
+//
+// CHECK-CXX-LABEL: define dso_local void @_Z26test_svmls_za16_bf16_vg1x2j14svbfloat16x2_tS_(
+// CHECK-CXX-SAME: i32 noundef [[SLICE:%.*]], <vscale x 16 x bfloat> [[ZN:%.*]], <vscale x 16 x bfloat> [[ZM:%.*]]) local_unnamed_addr #[[ATTR0]] {
+// CHECK-CXX-NEXT: entry:
+// CHECK-CXX-NEXT: [[TMP0:%.*]] = tail call <vscale x 8 x bfloat> @llvm.vector.extract.nxv8bf16.nxv16bf16(<vscale x 16 x bfloat> [[ZN]], i64 0)
+// CHECK-CXX-NEXT: [[TMP1:%.*]] = tail call <vscale x 8 x bfloat> @llvm.vector.extract.nxv8bf16.nxv16bf16(<vscale x 16 x bfloat> [[ZN]], i64 8)
+// CHECK-CXX-NEXT: [[TMP2:%.*]] = tail call <vscale x 8 x bfloat> @llvm.vector.extract.nxv8bf16.nxv16bf16(<vscale x 16 x bfloat> [[ZM]], i64 0)
+// CHECK-CXX-NEXT: [[TMP3:%.*]] = tail call <vscale x 8 x bfloat> @llvm.vector.extract.nxv8bf16.nxv16bf16(<vscale x 16 x bfloat> [[ZM]], i64 8)
+// CHECK-CXX-NEXT: tail call void @llvm.aarch64.sme.fmls.vg1x2.nxv8bf16(i32 [[SLICE]], <vscale x 8 x bfloat> [[TMP0]], <vscale x 8 x bfloat> [[TMP1]], <vscale x 8 x bfloat> [[TMP2]], <vscale x 8 x bfloat> [[TMP3]])
+// CHECK-CXX-NEXT: ret void
+//
+void test_svmls_za16_bf16_vg1x2(uint32_t slice, svbfloat16x2_t zn, svbfloat16x2_t zm) __arm_streaming __arm_inout("za") {
+ SME_ACLE_FUNC(svmls, , _za16, _bf16, _vg1x2)(slice, zn, zm);
+}
+
+// CHECK-LABEL: define dso_local void @test_svmls_za16_bf16_vg1x4(
+// CHECK-SAME: i32 noundef [[SLICE:%.*]], <vscale x 32 x bfloat> [[ZN:%.*]], <vscale x 32 x bfloat> [[ZM:%.*]]) local_unnamed_addr #[[ATTR0]] {
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 8 x bfloat> @llvm.vector.extract.nxv8bf16.nxv32bf16(<vscale x 32 x bfloat> [[ZN]], i64 0)
+// CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 8 x bfloat> @llvm.vector.extract.nxv8bf16.nxv32bf16(<vscale x 32 x bfloat> [[ZN]], i64 8)
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <vscale x 8 x bfloat> @llvm.vector.extract.nxv8bf16.nxv32bf16(<vscale x 32 x bfloat> [[ZN]], i64 16)
+// CHECK-NEXT: [[TMP3:%.*]] = tail call <vscale x 8 x bfloat> @llvm.vector.extract.nxv8bf16.nxv32bf16(<vscale x 32 x bfloat> [[ZN]], i64 24)
+// CHECK-NEXT: [[TMP4:%.*]] = tail call <vscale x 8 x bfloat> @llvm.vector.extract.nxv8bf16.nxv32bf16(<vscale x 32 x bfloat> [[ZM]], i64 0)
+// CHECK-NEXT: [[TMP5:%.*]] = tail call <vscale x 8 x bfloat> @llvm.vector.extract.nxv8bf16.nxv32bf16(<vscale x 32 x bfloat> [[ZM]], i64 8)
+// CHECK-NEXT: [[TMP6:%.*]] = tail call <vscale x 8 x bfloat> @llvm.vector.extract.nxv8bf16.nxv32bf16(<vscale x 32 x bfloat> [[ZM]], i64 16)
+// CHECK-NEXT: [[TMP7:%.*]] = tail call <vscale x 8 x bfloat> @llvm.vector.extract.nxv8bf16.nxv32bf16(<vscale x 32 x bfloat> [[ZM]], i64 24)
+// CHECK-NEXT: tail call void @llvm.aarch64.sme.fmls.vg1x4.nxv8bf16(i32 [[SLICE]], <vscale x 8 x bfloat> [[TMP0]], <vscale x 8 x bfloat> [[TMP1]], <vscale x 8 x bfloat> [[TMP2]], <vscale x 8 x bfloat> [[TMP3]], <vscale x 8 x bfloat> [[TMP4]], <vscale x 8 x bfloat> [[TMP5]], <vscale x 8 x bfloat> [[TMP6]], <vscale x 8 x bfloat> [[TMP7]])
+// CHECK-NEXT: ret void
+//
+// CHECK-CXX-LABEL: define dso_local void @_Z26test_svmls_za16_bf16_vg1x4j14svbfloat16x4_tS_(
+// CHECK-CXX-SAME: i32 noundef [[SLICE:%.*]], <vscale x 32 x bfloat> [[ZN:%.*]], <vscale x 32 x bfloat> [[ZM:%.*]]) local_unnamed_addr #[[ATTR0]] {
+// CHECK-CXX-NEXT: entry:
+// CHECK-CXX-NEXT: [[TMP0:%.*]] = tail call <vscale x 8 x bfloat> @llvm.vector.extract.nxv8bf16.nxv32bf16(<vscale x 32 x bfloat> [[ZN]], i64 0)
+// CHECK-CXX-NEXT: [[TMP1:%.*]] = tail call <vscale x 8 x bfloat> @llvm.vector.extract.nxv8bf16.nxv32bf16(<vscale x 32 x bfloat> [[ZN]], i64 8)
+// CHECK-CXX-NEXT: [[TMP2:%.*]] = tail call <vscale x 8 x bfloat> @llvm.vector.extract.nxv8bf16.nxv32bf16(<vscale x 32 x bfloat> [[ZN]], i64 16)
+// CHECK-CXX-NEXT: [[TMP3:%.*]] = tail call <vscale x 8 x bfloat> @llvm.vector.extract.nxv8bf16.nxv32bf16(<vscale x 32 x bfloat> [[ZN]], i64 24)
+// CHECK-CXX-NEXT: [[TMP4:%.*]] = tail call <vscale x 8 x bfloat> @llvm.vector.extract.nxv8bf16.nxv32bf16(<vscale x 32 x bfloat> [[ZM]], i64 0)
+// CHECK-CXX-NEXT: [[TMP5:%.*]] = tail call <vscale x 8 x bfloat> @llvm.vector.extract.nxv8bf16.nxv32bf16(<vscale x 32 x bfloat> [[ZM]], i64 8)
+// CHECK-CXX-NEXT: [[TMP6:%.*]] = tail call <vscale x 8 x bfloat> @llvm.vector.extract.nxv8bf16.nxv32bf16(<vscale x 32 x bfloat> [[ZM]], i64 16)
+// CHECK-CXX-NEXT: [[TMP7:%.*]] = tail call <vscale x 8 x bfloat> @llvm.vector.extract.nxv8bf16.nxv32bf16(<vscale x 32 x bfloat> [[ZM]], i64 24)
+// CHECK-CXX-NEXT: tail call void @llvm.aarch64.sme.fmls.vg1x4.nxv8bf16(i32 [[SLICE]], <vscale x 8 x bfloat> [[TMP0]], <vscale x 8 x bfloat> [[TMP1]], <vscale x 8 x bfloat> [[TMP2]], <vscale x 8 x bfloat> [[TMP3]], <vscale x 8 x bfloat> [[TMP4]], <vscale x 8 x bfloat> [[TMP5]], <vscale x 8 x bfloat> [[TMP6]], <vscale x 8 x bfloat> [[TMP7]])
+// CHECK-CXX-NEXT: ret void
+//
+void test_svmls_za16_bf16_vg1x4(uint32_t slice, svbfloat16x4_t zn, svbfloat16x4_t zm) __arm_streaming __arm_inout("za") {
+ SME_ACLE_FUNC(svmls, , _za16, _bf16, _vg1x4)(slice, zn, zm);
+}
+
+// CHECK-LABEL: define dso_local void @test_svmla_lane_za16_bf16_vg1x2(
+// CHECK-SAME: i32 noundef [[SLICE:%.*]], <vscale x 16 x bfloat> [[ZN:%.*]], <vscale x 8 x bfloat> [[ZM:%.*]]) local_unnamed_addr #[[ATTR0]] {
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 8 x bfloat> @llvm.vector.extract.nxv8bf16.nxv16bf16(<vscale x 16 x bfloat> [[ZN]], i64 0)
+// CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 8 x bfloat> @llvm.vector.extract.nxv8bf16.nxv16bf16(<vscale x 16 x bfloat> [[ZN]], i64 8)
+// CHECK-NEXT: tail call void @llvm.aarch64.sme.fmla.lane.vg1x2.nxv8bf16(i32 [[SLICE]], <vscale x 8 x bfloat> [[TMP0]], <vscale x 8 x bfloat> [[TMP1]], <vscale x 8 x bfloat> [[ZM]], i32 7)
+// CHECK-NEXT: ret void
+//
+// CHECK-CXX-LABEL: define dso_local void @_Z31test_svmla_lane_za16_bf16_vg1x2j14svbfloat16x2_tu14__SVBfloat16_t(
+// CHECK-CXX-SAME: i32 noundef [[SLICE:%.*]], <vscale x 16 x bfloat> [[ZN:%.*]], <vscale x 8 x bfloat> [[ZM:%.*]]) local_unnamed_addr #[[ATTR0]] {
+// CHECK-CXX-NEXT: entry:
+// CHECK-CXX-NEXT: [[TMP0:%.*]] = tail call <vscale x 8 x bfloat> @llvm.vector.extract.nxv8bf16.nxv16bf16(<vscale x 16 x bfloat> [[ZN]], i64 0)
+// CHECK-CXX-NEXT: [[TMP1:%.*]] = tail call <vscale x 8 x bfloat> @llvm.vector.extract.nxv8bf16.nxv16bf16(<vscale x 16 x bfloat> [[ZN]], i64 8)
+// CHECK-CXX-NEXT: tail call void @llvm.aarch64.sme.fmla.lane.vg1x2.nxv8bf16(i32 [[SLICE]], <vscale x 8 x bfloat> [[TMP0]], <vscale x 8 x bfloat> [[TMP1]], <vscale x 8 x bfloat> [[ZM]], i32 7)
+// CHECK-CXX-NEXT: ret void
+//
+void test_svmla_lane_za16_bf16_vg1x2(uint32_t slice, svbfloat16x2_t zn, svbfloat16_t zm) __arm_streaming __arm_inout("za") {
+ SME_ACLE_FUNC(svmla_lane, , _za16, _bf16, _vg1x2)(slice, zn, zm, 7);
+}
+
+// CHECK-LABEL: define dso_local void @test_svmla_lane_za16_bf16_vg1x4(
+// CHECK-SAME: i32 noundef [[SLICE:%.*]], <vscale x 32 x bfloat> [[ZN:%.*]], <vscale x 8 x bfloat> [[ZM:%.*]]) local_unnamed_addr #[[ATTR0]] {
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 8 x bfloat> @llvm.vector.extract.nxv8bf16.nxv32bf16(<vscale x 32 x bfloat> [[ZN]], i64 0)
+// CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 8 x bfloat> @llvm.vector.extract.nxv8bf16.nxv32bf16(<vscale x 32 x bfloat> [[ZN]], i64 8)
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <vscale x 8 x bfloat> @llvm.vector.extract.nxv8bf16.nxv32bf16(<vscale x 32 x bfloat> [[ZN]], i64 16)
+// CHECK-NEXT: [[TMP3:%.*]] = tail call <vscale x 8 x bfloat> @llvm.vector.extract.nxv8bf16.nxv32bf16(<vscale x 32 x bfloat> [[ZN]], i64 24)
+// CHECK-NEXT: tail call void @llvm.aarch64.sme.fmla.lane.vg1x4.nxv8bf16(i32 [[SLICE]], <vscale x 8 x bfloat> [[TMP0]], <vscale x 8 x bfloat> [[TMP1]], <vscale x 8 x bfloat> [[TMP2]], <vscale x 8 x bfloat> [[TMP3]], <vscale x 8 x bfloat> [[ZM]], i32 7)
+// CHECK-NEXT: ret void
+//
+// CHECK-CXX-LABEL: define dso_local void @_Z31test_svmla_lane_za16_bf16_vg1x4j14svbfloat16x4_tu14__SVBfloat16_t(
+// CHECK-CXX-SAME: i32 noundef [[SLICE:%.*]], <vscale x 32 x bfloat> [[ZN:%.*]], <vscale x 8 x bfloat> [[ZM:%.*]]) local_unnamed_addr #[[ATTR0]] {
+// CHECK-CXX-NEXT: entry:
+// CHECK-CXX-NEXT: [[TMP0:%.*]] = tail call <vscale x 8 x bfloat> @llvm.vector.extract.nxv8bf16.nxv32bf16(<vscale x 32 x bfloat> [[ZN]], i64 0)
+// CHECK-CXX-NEXT: [[TMP1:%.*]] = tail call <vscale x 8 x bfloat> @llvm.vector.extract.nxv8bf16.nxv32bf16(<vscale x 32 x bfloat> [[ZN]], i64 8)
+// CHECK-CXX-NEXT: [[TMP2:%.*]] = tail call <vscale x 8 x bfloat> @llvm.vector.extract.nxv8bf16.nxv32bf16(<vscale x 32 x bfloat> [[ZN]], i64 16)
+// CHECK-CXX-NEXT: [[TMP3:%.*]] = tail call <vscale x 8 x bfloat> @llvm.vector.extract.nxv8bf16.nxv32bf16(<vscale x 32 x bfloat> [[ZN]], i64 24)
+// CHECK-CXX-NEXT: tail call void @llvm.aarch64.sme.fmla.lane.vg1x4.nxv8bf16(i32 [[SLICE]], <vscale x 8 x bfloat> [[TMP0]], <vscale x 8 x bfloat> [[TMP1]], <vscale x 8 x bfloat> [[TMP2]], <vscale x 8 x bfloat> [[TMP3]], <vscale x 8 x bfloat> [[ZM]], i32 7)
+// CHECK-CXX-NEXT: ret void
+//
+void test_svmla_lane_za16_bf16_vg1x4(uint32_t slice, svbfloat16x4_t zn, svbfloat16_t zm) __arm_streaming __arm_inout("za") {
+ SME_ACLE_FUNC(svmla_lane, , _za16, _bf16, _vg1x4)(slice, zn, zm, 7);
+}
+
+// CHECK-LABEL: define dso_local void @test_svmls_lane_za16_bf16_vg1x2(
+// CHECK-SAME: i32 noundef [[SLICE:%.*]], <vscale x 16 x bfloat> [[ZN:%.*]], <vscale x 8 x bfloat> [[ZM:%.*]]) local_unnamed_addr #[[ATTR0]] {
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 8 x bfloat> @llvm.vector.extract.nxv8bf16.nxv16bf16(<vscale x 16 x bfloat> [[ZN]], i64 0)
+// CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 8 x bfloat> @llvm.vector.extract.nxv8bf16.nxv16bf16(<vscale x 16 x bfloat> [[ZN]], i64 8)
+// CHECK-NEXT: tail call void @llvm.aarch64.sme.fmls.lane.vg1x2.nxv8bf16(i32 [[SLICE]], <vscale x 8 x bfloat> [[TMP0]], <vscale x 8 x bfloat> [[TMP1]], <vscale x 8 x bfloat> [[ZM]], i32 7)
+// CHECK-NEXT: ret void
+//
+// CHECK-CXX-LABEL: define dso_local void @_Z31test_svmls_lane_za16_bf16_vg1x2j14svbfloat16x2_tu14__SVBfloat16_t(
+// CHECK-CXX-SAME: i32 noundef [[SLICE:%.*]], <vscale x 16 x bfloat> [[ZN:%.*]], <vscale x 8 x bfloat> [[ZM:%.*]]) local_unnamed_addr #[[ATTR0]] {
+// CHECK-CXX-NEXT: entry:
+// CHECK-CXX-NEXT: [[TMP0:%.*]] = tail call <vscale x 8 x bfloat> @llvm.vector.extract.nxv8bf16.nxv16bf16(<vscale x 16 x bfloat> [[ZN]], i64 0)
+// CHECK-CXX-NEXT: [[TMP1:%.*]] = tail call <vscale x 8 x bfloat> @llvm.vector.extract.nxv8bf16.nxv16bf16(<vscale x 16 x bfloat> [[ZN]], i64 8)
+// CHECK-CXX-NEXT: tail call void @llvm.aarch64.sme.fmls.lane.vg1x2.nxv8bf16(i32 [[SLICE]], <vscale x 8 x bfloat> [[TMP0]], <vscale x 8 x bfloat> [[TMP1]], <vscale x 8 x bfloat> [[ZM]], i32 7)
+// CHECK-CXX-NEXT: ret void
+//
+void test_svmls_lane_za16_bf16_vg1x2(uint32_t slice, svbfloat16x2_t zn, svbfloat16_t zm) __arm_streaming __arm_inout("za") {
+ SME_ACLE_FUNC(svmls_lane, , _za16, _bf16, _vg1x2)(slice, zn, zm, 7);
+}
+
+// CHECK-LABEL: define dso_local void @test_svmls_lane_za16_bf16_vg1x4(
+// CHECK-SAME: i32 noundef [[SLICE:%.*]], <vscale x 32 x bfloat> [[ZN:%.*]], <vscale x 8 x bfloat> [[ZM:%.*]]) local_unnamed_addr #[[ATTR0]] {
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 8 x bfloat> @llvm.vector.extract.nxv8bf16.nxv32bf16(<vscale x 32 x bfloat> [[ZN]], i64 0)
+// CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 8 x bfloat> @llvm.vector.extract.nxv8bf16.nxv32bf16(<vscale x 32 x bfloat> [[ZN]], i64 8)
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <vscale x 8 x bfloat> @llvm.vector.extract.nxv8bf16.nxv32bf16(<vscale x 32 x bfloat> [[ZN]], i64 16)
+// CHECK-NEXT: [[TMP3:%.*]] = tail call <vscale x 8 x bfloat> @llvm.vector.extract.nxv8bf16.nxv32bf16(<vscale x 32 x bfloat> [[ZN]], i64 24)
+// CHECK-NEXT: tail call void @llvm.aarch64.sme.fmls.lane.vg1x4.nxv8bf16(i32 [[SLICE]], <vscale x 8 x bfloat> [[TMP0]], <vscale x 8 x bfloat> [[TMP1]], <vscale x 8 x bfloat> [[TMP2]], <vscale x 8 x bfloat> [[TMP3]], <vscale x 8 x bfloat> [[ZM]], i32 7)
+// CHECK-NEXT: ret void
+//
+// CHECK-CXX-LABEL: define dso_local void @_Z31test_svmls_lane_za16_bf16_vg1x4j14svbfloat16x4_tu14__SVBfloat16_t(
+// CHECK-CXX-SAME: i32 noundef [[SLICE:%.*]], <vscale x 32 x bfloat> [[ZN:%.*]], <vscale x 8 x bfloat> [[ZM:%.*]]) local_unnamed_addr #[[ATTR0]] {
+// CHECK-CXX-NEXT: entry:
+// CHECK-CXX-NEXT: [[TMP0:%.*]] = tail call <vscale x 8 x bfloat> @llvm.vector.extract.nxv8bf16.nxv32bf16(<vscale x 32 x bfloat> [[ZN]], i64 0)
+// CHECK-CXX-NEXT: [[TMP1:%.*]] = tail call <vscale x 8 x bfloat> @llvm.vector.extract.nxv8bf16.nxv32bf16(<vscale x 32 x bfloat> [[ZN]], i64 8)
+// CHECK-CXX-NEXT: [[TMP2:%.*]] = tail call <vscale x 8 x bfloat> @llvm.vector.extract.nxv8bf16.nxv32bf16(<vscale x 32 x bfloat> [[ZN]], i64 16)
+// CHECK-CXX-NEXT: [[TMP3:%.*]] = tail call <vscale x 8 x bfloat> @llvm.vector.extract.nxv8bf16.nxv32bf16(<vscale x 32 x bfloat> [[ZN]], i64 24)
+// CHECK-CXX-NEXT: tail call void @llvm.aarch64.sme.fmls.lane.vg1x4.nxv8bf16(i32 [[SLICE]], <vscale x 8 x bfloat> [[TMP0]], <vscale x 8 x bfloat> [[TMP1]], <vscale x 8 x bfloat> [[TMP2]], <vscale x 8 x bfloat> [[TMP3]], <vscale x 8 x bfloat> [[ZM]], i32 7)
+// CHECK-CXX-NEXT: ret void
+//
+void test_svmls_lane_za16_bf16_vg1x4(uint32_t slice, svbfloat16x4_t zn, svbfloat16_t zm) __arm_streaming __arm_inout("za") {
+ SME_ACLE_FUNC(svmls_lane, , _za16, _bf16, _vg1x4)(slice, zn, zm, 7);
+}
diff --git a/clang/test/Sema/aarch64-sme2-intrinsics/acle_sme2_fmlas16.c b/clang/test/Sema/aarch64-sme2-intrinsics/acle_sme2_fmlas16.c
new file mode 100644
index 0000000000000..c6cf8a518fa79
--- /dev/null
+++ b/clang/test/Sema/aarch64-sme2-intrinsics/acle_sme2_fmlas16.c
@@ -0,0 +1,90 @@
+// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme -verify -S -emit-llvm %s
+
+// REQUIRES: aarch64-registered-target
+
+#include <arm_sme.h>
+
+
+void test_features_f16f16(uint32_t slice,
+ svfloat16_t zm,
+ svfloat16x2_t zn2, svfloat16x2_t zm2,
+ svfloat16x4_t zn4, svfloat16x4_t zm4,
+ svbfloat16_t bzm,
+ svbfloat16x2_t bzn2, svbfloat16x2_t bzm2,
+ svbfloat16x4_t bzn4, svbfloat16x4_t bzm4)
+
+ __arm_streaming __arm_inout("za") {
+ // expected-error at +1 {{'svmla_single_za16_f16_vg1x2' needs target feature sme2p1,sme-f16f16}}
+ svmla_single_za16_f16_vg1x2(slice, zn2, zm);
+ // expected-error at +1 {{'svmla_single_za16_f16_vg1x4' needs target feature sme2p1,sme-f16f16}}
+ svmla_single_za16_f16_vg1x4(slice, zn4, zm);
+ // expected-error at +1 {{'svmls_single_za16_f16_vg1x2' needs target feature sme2p1,sme-f16f16}}
+ svmls_single_za16_f16_vg1x2(slice, zn2, zm);
+ // expected-error at +1 {{'svmls_single_za16_f16_vg1x4' needs target feature sme2p1,sme-f16f16}}
+ svmls_single_za16_f16_vg1x4(slice, zn4, zm);
+ // expected-error at +1 {{'svmla_za16_f16_vg1x2' needs target feature sme2p1,sme-f16f16}}
+ svmla_za16_f16_vg1x2(slice, zn2, zm2);
+ // expected-error at +1 {{'svmla_za16_f16_vg1x4' needs target feature sme2p1,sme-f16f16}}
+ svmla_za16_f16_vg1x4(slice, zn4, zm4);
+ // expected-error at +1 {{'svmls_za16_f16_vg1x2' needs target feature sme2p1,sme-f16f16}}
+ svmls_za16_f16_vg1x2(slice, zn2, zm2);
+ // expected-error at +1 {{'svmls_za16_f16_vg1x4' needs target feature sme2p1,sme-f16f16}}
+ svmls_za16_f16_vg1x4(slice, zn4, zm4);
+ // expected-error at +1 {{'svmla_lane_za16_f16_vg1x2' needs target feature sme2p1,sme-f16f16}}
+ svmla_lane_za16_f16_vg1x2(slice, zn2, zm, 7);
+ // expected-error at +1 {{'svmla_lane_za16_f16_vg1x4' needs target feature sme2p1,sme-f16f16}}
+ svmla_lane_za16_f16_vg1x4(slice, zn4, zm, 7);
+ // expected-error at +1 {{'svmls_lane_za16_f16_vg1x2' needs target feature sme2p1,sme-f16f16}}
+ svmls_lane_za16_f16_vg1x2(slice, zn2, zm, 7);
+ // expected-error at +1 {{'svmls_lane_za16_f16_vg1x4' needs target feature sme2p1,sme-f16f16}}
+ svmls_lane_za16_f16_vg1x4(slice, zn4, zm, 7);
+
+ // expected-error at +1 {{'svmla_single_za16_bf16_vg1x2' needs target feature sme2,b16b16}}
+ svmla_single_za16_bf16_vg1x2(slice, bzn2, bzm);
+ // expected-error at +1 {{'svmla_single_za16_bf16_vg1x4' needs target feature sme2,b16b16}}
+ svmla_single_za16_bf16_vg1x4(slice, bzn4, bzm);
+ // expected-error at +1 {{'svmls_single_za16_bf16_vg1x2' needs target feature sme2,b16b16}}
+ svmls_single_za16_bf16_vg1x2(slice, bzn2, bzm);
+ // expected-error at +1 {{'svmls_single_za16_bf16_vg1x4' needs target feature sme2,b16b16}}
+ svmls_single_za16_bf16_vg1x4(slice, bzn4, bzm);
+ // expected-error at +1 {{'svmla_za16_bf16_vg1x2' needs target feature sme2,b16b16}}
+ svmla_za16_bf16_vg1x2(slice, bzn2, bzm2);
+ // expected-error at +1 {{'svmla_za16_bf16_vg1x4' needs target feature sme2,b16b16}}
+ svmla_za16_bf16_vg1x4(slice, bzn4, bzm4);
+ // expected-error at +1 {{'svmls_za16_bf16_vg1x2' needs target feature sme2,b16b16}}
+ svmls_za16_bf16_vg1x2(slice, bzn2, bzm2);
+ // expected-error at +1 {{'svmls_za16_bf16_vg1x4' needs target feature sme2,b16b16}}
+ svmls_za16_bf16_vg1x4(slice, bzn4, bzm4);
+ // expected-error at +1 {{'svmla_lane_za16_bf16_vg1x2' needs target feature sme2,b16b16}}
+ svmla_lane_za16_bf16_vg1x2(slice, bzn2, bzm, 7);
+ // expected-error at +1 {{'svmla_lane_za16_bf16_vg1x4' needs target feature sme2,b16b16}}
+ svmla_lane_za16_bf16_vg1x4(slice, bzn4, bzm, 7);
+ // expected-error at +1 {{'svmls_lane_za16_bf16_vg1x2' needs target feature sme2,b16b16}}
+ svmls_lane_za16_bf16_vg1x2(slice, bzn2, bzm, 7);
+ // expected-error at +1 {{'svmls_lane_za16_bf16_vg1x4' needs target feature sme2,b16b16}}
+ svmls_lane_za16_bf16_vg1x4(slice, bzn4, bzm, 7);
+}
+
+
+void test_imm(uint32_t slice, svfloat16_t zm, svfloat16x2_t zn2,svfloat16x4_t zn4,
+ svbfloat16_t bzm, svbfloat16x2_t bzn2, svbfloat16x4_t bzn4)
+ __arm_streaming __arm_inout("za") {
+
+ // expected-error at +1 {{argument value 18446744073709551615 is outside the valid range [0, 7]}}
+ svmla_lane_za16_f16_vg1x2(slice, zn2, zm, -1);
+ // expected-error at +1 {{argument value 18446744073709551615 is outside the valid range [0, 7]}}
+ svmla_lane_za16_f16_vg1x4(slice, zn4, zm, -1);
+ // expected-error at +1 {{argument value 18446744073709551615 is outside the valid range [0, 7]}}
+ svmls_lane_za16_f16_vg1x2(slice, zn2, zm, -1);
+ // expected-error at +1 {{argument value 18446744073709551615 is outside the valid range [0, 7]}}
+ svmls_lane_za16_f16_vg1x4(slice, zn4, zm, -1);
+
+ // expected-error at +1 {{argument value 18446744073709551615 is outside the valid range [0, 7]}}
+ svmla_lane_za16_bf16_vg1x2(slice, bzn2, bzm, -1);
+ // expected-error at +1 {{argument value 18446744073709551615 is outside the valid range [0, 7]}}
+ svmla_lane_za16_bf16_vg1x4(slice, bzn4, bzm, -1);
+ // expected-error at +1 {{argument value 18446744073709551615 is outside the valid range [0, 7]}}
+ svmls_lane_za16_bf16_vg1x2(slice, bzn2, bzm, -1);
+ // expected-error at +1 {{argument value 18446744073709551615 is outside the valid range [0, 7]}}
+ svmls_lane_za16_bf16_vg1x4(slice, bzn4, bzm, -1);
+}
\ No newline at end of file
diff --git a/llvm/lib/Target/AArch64/AArch64SMEInstrInfo.td b/llvm/lib/Target/AArch64/AArch64SMEInstrInfo.td
index 574178c8d5244..a2e8c530c1dff 100644
--- a/llvm/lib/Target/AArch64/AArch64SMEInstrInfo.td
+++ b/llvm/lib/Target/AArch64/AArch64SMEInstrInfo.td
@@ -797,22 +797,20 @@ defm FADD_VG2_M2Z_H : sme2_multivec_accum_add_sub_vg2<"fadd", 0b0100, MatrixOp16
defm FADD_VG4_M4Z_H : sme2_multivec_accum_add_sub_vg4<"fadd", 0b0100, MatrixOp16, ZZZZ_h_mul_r, nxv8f16, null_frag>;
defm FSUB_VG2_M2Z_H : sme2_multivec_accum_add_sub_vg2<"fsub", 0b0101, MatrixOp16, ZZ_h_mul_r, nxv8f16, null_frag>;
defm FSUB_VG4_M4Z_H : sme2_multivec_accum_add_sub_vg4<"fsub", 0b0101, MatrixOp16, ZZZZ_h_mul_r, nxv8f16, null_frag>;
-}
-let Predicates = [HasSMEF16F16] in {
-defm FMLA_VG2_M2ZZI_H : sme2p1_multi_vec_array_vg2_index_16b<"fmla", 0b00, 0b100, ZZ_h_mul_r, ZPR4b16>;
-defm FMLA_VG4_M4ZZI_H : sme2p1_multi_vec_array_vg4_index_16b<"fmla", 0b000, ZZZZ_h_mul_r, ZPR4b16>;
-defm FMLA_VG2_M2ZZ_H : sme2_dot_mla_add_sub_array_vg24_single<"fmla", 0b0011100, MatrixOp16, ZZ_h, ZPR4b16>;
-defm FMLA_VG4_M4ZZ_H : sme2_dot_mla_add_sub_array_vg24_single<"fmla", 0b0111100, MatrixOp16, ZZZZ_h, ZPR4b16>;
-defm FMLA_VG2_M2Z4Z_H : sme2_dot_mla_add_sub_array_vg2_multi<"fmla", 0b0100001, MatrixOp16, ZZ_h_mul_r, nxv8f16, null_frag>;
-defm FMLA_VG4_M4Z4Z_H : sme2_dot_mla_add_sub_array_vg4_multi<"fmla", 0b0100001, MatrixOp16, ZZZZ_h_mul_r, nxv8f16, null_frag>;
-
-defm FMLS_VG2_M2ZZI_H : sme2p1_multi_vec_array_vg2_index_16b<"fmls", 0b00, 0b101, ZZ_h_mul_r, ZPR4b16>;
-defm FMLS_VG4_M4ZZI_H : sme2p1_multi_vec_array_vg4_index_16b<"fmls", 0b001, ZZZZ_h_mul_r, ZPR4b16>;
-defm FMLS_VG2_M2ZZ_H : sme2_dot_mla_add_sub_array_vg24_single<"fmls", 0b0011101, MatrixOp16, ZZ_h, ZPR4b16>;
-defm FMLS_VG4_M4ZZ_H : sme2_dot_mla_add_sub_array_vg24_single<"fmls", 0b0111101, MatrixOp16, ZZZZ_h, ZPR4b16>;
-defm FMLS_VG2_M2Z2Z_H : sme2_dot_mla_add_sub_array_vg2_multi<"fmls", 0b0100011, MatrixOp16, ZZ_h_mul_r, nxv8f16, null_frag>;
-defm FMLS_VG4_M4Z2Z_H : sme2_dot_mla_add_sub_array_vg4_multi<"fmls", 0b0100011, MatrixOp16, ZZZZ_h_mul_r, nxv8f16, null_frag>;
+defm FMLA_VG2_M2ZZI_H : sme2p1_multi_vec_array_vg2_index_16b<"fmla", 0b00, 0b100, ZZ_h_mul_r, ZPR4b16, nxv8f16, int_aarch64_sme_fmla_lane_vg1x2>;
+defm FMLA_VG4_M4ZZI_H : sme2p1_multi_vec_array_vg4_index_16b<"fmla", 0b000, ZZZZ_h_mul_r, ZPR4b16, nxv8f16, int_aarch64_sme_fmla_lane_vg1x4>;
+defm FMLA_VG2_M2ZZ_H : sme2_dot_mla_add_sub_array_vg2_single<"fmla", 0b0011100, MatrixOp16, ZZ_h, ZPR4b16, nxv8f16, int_aarch64_sme_fmla_single_vg1x2>;
+defm FMLA_VG4_M4ZZ_H : sme2_dot_mla_add_sub_array_vg4_single<"fmla", 0b0111100, MatrixOp16, ZZZZ_h, ZPR4b16, nxv8f16, int_aarch64_sme_fmla_single_vg1x4>;
+defm FMLA_VG2_M2Z4Z_H : sme2_dot_mla_add_sub_array_vg2_multi<"fmla", 0b0100001, MatrixOp16, ZZ_h_mul_r, nxv8f16, int_aarch64_sme_fmla_vg1x2>;
+defm FMLA_VG4_M4Z4Z_H : sme2_dot_mla_add_sub_array_vg4_multi<"fmla", 0b0100001, MatrixOp16, ZZZZ_h_mul_r, nxv8f16, int_aarch64_sme_fmla_vg1x4>;
+
+defm FMLS_VG2_M2ZZI_H : sme2p1_multi_vec_array_vg2_index_16b<"fmls", 0b00, 0b101, ZZ_h_mul_r, ZPR4b16, nxv8f16, int_aarch64_sme_fmls_lane_vg1x2>;
+defm FMLS_VG4_M4ZZI_H : sme2p1_multi_vec_array_vg4_index_16b<"fmls", 0b001, ZZZZ_h_mul_r, ZPR4b16, nxv8f16, int_aarch64_sme_fmls_lane_vg1x4>;
+defm FMLS_VG2_M2ZZ_H : sme2_dot_mla_add_sub_array_vg2_single<"fmls", 0b0011101, MatrixOp16, ZZ_h, ZPR4b16, nxv8f16, int_aarch64_sme_fmls_single_vg1x2>;
+defm FMLS_VG4_M4ZZ_H : sme2_dot_mla_add_sub_array_vg4_single<"fmls", 0b0111101, MatrixOp16, ZZZZ_h, ZPR4b16, nxv8f16, int_aarch64_sme_fmls_single_vg1x4>;
+defm FMLS_VG2_M2Z2Z_H : sme2_dot_mla_add_sub_array_vg2_multi<"fmls", 0b0100011, MatrixOp16, ZZ_h_mul_r, nxv8f16, int_aarch64_sme_fmls_vg1x2>;
+defm FMLS_VG4_M4Z2Z_H : sme2_dot_mla_add_sub_array_vg4_multi<"fmls", 0b0100011, MatrixOp16, ZZZZ_h_mul_r, nxv8f16, int_aarch64_sme_fmls_vg1x4>;
defm FCVT_2ZZ_H : sme2p1_fp_cvt_vector_vg2_single<"fcvt", 0b0>;
defm FCVTL_2ZZ_H : sme2p1_fp_cvt_vector_vg2_single<"fcvtl", 0b1>;
@@ -827,20 +825,19 @@ defm BFADD_VG4_M4Z_H : sme2_multivec_accum_add_sub_vg4<"bfadd", 0b1100, MatrixOp
defm BFSUB_VG2_M2Z_H : sme2_multivec_accum_add_sub_vg2<"bfsub", 0b1101, MatrixOp16, ZZ_h_mul_r, nxv8bf16, null_frag>;
defm BFSUB_VG4_M4Z_H : sme2_multivec_accum_add_sub_vg4<"bfsub", 0b1101, MatrixOp16, ZZZZ_h_mul_r, nxv8bf16, null_frag>;
-defm BFMLA_VG2_M2ZZI : sme2p1_multi_vec_array_vg2_index_16b<"bfmla", 0b00, 0b110, ZZ_h_mul_r, ZPR4b16>;
-defm BFMLA_VG4_M4ZZI : sme2p1_multi_vec_array_vg4_index_16b<"bfmla", 0b010, ZZZZ_h_mul_r, ZPR4b16>;
-defm BFMLA_VG2_M2ZZ : sme2_dot_mla_add_sub_array_vg24_single<"bfmla", 0b1011100, MatrixOp16, ZZ_h, ZPR4b16>;
-defm BFMLA_VG4_M4ZZ : sme2_dot_mla_add_sub_array_vg24_single<"bfmla", 0b1111100, MatrixOp16, ZZZZ_h, ZPR4b16>;
-defm BFMLA_VG2_M2Z2Z : sme2_dot_mla_add_sub_array_vg2_multi<"bfmla", 0b1100001, MatrixOp16, ZZ_h_mul_r, nxv8bf16, null_frag>;
-defm BFMLA_VG4_M4Z4Z : sme2_dot_mla_add_sub_array_vg4_multi<"bfmla", 0b1100001, MatrixOp16, ZZZZ_h_mul_r, nxv8bf16, null_frag>;
-
-defm BFMLS_VG2_M2ZZI : sme2p1_multi_vec_array_vg2_index_16b<"bfmls", 0b00, 0b111, ZZ_h_mul_r, ZPR4b16>;
-defm BFMLS_VG4_M4ZZI : sme2p1_multi_vec_array_vg4_index_16b<"bfmls", 0b011, ZZZZ_h_mul_r, ZPR4b16>;
-defm BFMLS_VG2_M2ZZ : sme2_dot_mla_add_sub_array_vg24_single<"bfmls", 0b1011101, MatrixOp16, ZZ_h, ZPR4b16>;
-defm BFMLS_VG4_M4ZZ : sme2_dot_mla_add_sub_array_vg24_single<"bfmls", 0b1111101, MatrixOp16, ZZZZ_h, ZPR4b16>;
-defm BFMLS_VG2_M2Z2Z : sme2_dot_mla_add_sub_array_vg2_multi<"bfmls", 0b1100011, MatrixOp16, ZZ_h_mul_r, nxv8bf16, null_frag>;
-defm BFMLS_VG4_M4Z4Z : sme2_dot_mla_add_sub_array_vg4_multi<"bfmls", 0b1100011, MatrixOp16, ZZZZ_h_mul_r, nxv8bf16, null_frag>;
+defm BFMLA_VG2_M2ZZI : sme2p1_multi_vec_array_vg2_index_16b<"bfmla", 0b00, 0b110, ZZ_h_mul_r, ZPR4b16, nxv8bf16, int_aarch64_sme_fmla_lane_vg1x2>;
+defm BFMLA_VG4_M4ZZI : sme2p1_multi_vec_array_vg4_index_16b<"bfmla", 0b010, ZZZZ_h_mul_r, ZPR4b16, nxv8bf16, int_aarch64_sme_fmla_lane_vg1x4>;
+defm BFMLA_VG2_M2ZZ : sme2_dot_mla_add_sub_array_vg2_single<"bfmla", 0b1011100, MatrixOp16, ZZ_h, ZPR4b16, nxv8bf16, int_aarch64_sme_fmla_single_vg1x2>;
+defm BFMLA_VG4_M4ZZ : sme2_dot_mla_add_sub_array_vg4_single<"bfmla", 0b1111100, MatrixOp16, ZZZZ_h, ZPR4b16, nxv8bf16, int_aarch64_sme_fmla_single_vg1x4>;
+defm BFMLA_VG2_M2Z2Z : sme2_dot_mla_add_sub_array_vg2_multi<"bfmla", 0b1100001, MatrixOp16, ZZ_h_mul_r, nxv8bf16, int_aarch64_sme_fmla_vg1x2>;
+defm BFMLA_VG4_M4Z4Z : sme2_dot_mla_add_sub_array_vg4_multi<"bfmla", 0b1100001, MatrixOp16, ZZZZ_h_mul_r, nxv8bf16, int_aarch64_sme_fmla_vg1x4>;
+defm BFMLS_VG2_M2ZZI : sme2p1_multi_vec_array_vg2_index_16b<"bfmls", 0b00, 0b111, ZZ_h_mul_r, ZPR4b16, nxv8bf16, int_aarch64_sme_fmls_lane_vg1x2>;
+defm BFMLS_VG4_M4ZZI : sme2p1_multi_vec_array_vg4_index_16b<"bfmls", 0b011, ZZZZ_h_mul_r, ZPR4b16, nxv8bf16, int_aarch64_sme_fmls_lane_vg1x4>;
+defm BFMLS_VG2_M2ZZ : sme2_dot_mla_add_sub_array_vg2_single<"bfmls", 0b1011101, MatrixOp16, ZZ_h, ZPR4b16, nxv8bf16, int_aarch64_sme_fmls_single_vg1x2>;
+defm BFMLS_VG4_M4ZZ : sme2_dot_mla_add_sub_array_vg4_single<"bfmls", 0b1111101, MatrixOp16, ZZZZ_h, ZPR4b16, nxv8bf16, int_aarch64_sme_fmls_single_vg1x4>;
+defm BFMLS_VG2_M2Z2Z : sme2_dot_mla_add_sub_array_vg2_multi<"bfmls", 0b1100011, MatrixOp16, ZZ_h_mul_r, nxv8bf16, int_aarch64_sme_fmls_vg1x2>;
+defm BFMLS_VG4_M4Z4Z : sme2_dot_mla_add_sub_array_vg4_multi<"bfmls", 0b1100011, MatrixOp16, ZZZZ_h_mul_r, nxv8bf16, int_aarch64_sme_fmls_vg1x4>;
defm BFMAX_VG2_2ZZ : sme2p1_bf_max_min_vector_vg2_single<"bfmax", 0b0010000>;
defm BFMAX_VG4_4ZZ : sme2p1_bf_max_min_vector_vg4_single<"bfmax", 0b0010000>;
@@ -909,9 +906,9 @@ def LUTI4_S_4ZZT2Z : sme2_luti4_vector_vg4_strided<0b00, 0b00, "luti4">;
} //[HasSME2p1, HasSME_LUTv2]
let Predicates = [HasSMEF8F16] in {
-defm FVDOT_VG2_M2ZZI_BtoH : sme2p1_multi_vec_array_vg2_index_16b<"fvdot", 0b11, 0b110, ZZ_b_mul_r, ZPR4b8>;
-defm FDOT_VG2_M2ZZI_BtoH : sme2p1_multi_vec_array_vg2_index_16b<"fdot", 0b11, 0b010, ZZ_b_mul_r, ZPR4b8>;
-defm FDOT_VG4_M4ZZI_BtoH : sme2p1_multi_vec_array_vg4_index_16b<"fdot", 0b100, ZZZZ_b_mul_r, ZPR4b8>;
+defm FVDOT_VG2_M2ZZI_BtoH : sme2p1_multi_vec_array_vg2_index_f8f16<"fvdot", 0b11, 0b110, ZZ_b_mul_r, ZPR4b8>;
+defm FDOT_VG2_M2ZZI_BtoH : sme2p1_multi_vec_array_vg2_index_f8f16<"fdot", 0b11, 0b010, ZZ_b_mul_r, ZPR4b8>;
+defm FDOT_VG4_M4ZZI_BtoH : sme2p1_multi_vec_array_vg4_index_f8f16<"fdot", 0b100, ZZZZ_b_mul_r, ZPR4b8>;
defm FDOT_VG2_M2ZZ_BtoH : sme2_dot_mla_add_sub_array_vg24_single<"fdot", 0b0010001, MatrixOp16, ZZ_b, ZPR4b8>;
defm FDOT_VG4_M4ZZ_BtoH : sme2_dot_mla_add_sub_array_vg24_single<"fdot", 0b0110001, MatrixOp16, ZZZZ_b, ZPR4b8>;
// TODO: Replace nxv16i8 by nxv16f8
diff --git a/llvm/lib/Target/AArch64/SMEInstrFormats.td b/llvm/lib/Target/AArch64/SMEInstrFormats.td
index 3363aab4b093c..724dd07225cde 100644
--- a/llvm/lib/Target/AArch64/SMEInstrFormats.td
+++ b/llvm/lib/Target/AArch64/SMEInstrFormats.td
@@ -2448,9 +2448,29 @@ multiclass sme2_multi_vec_array_vg2_index_32b<string mnemonic, bits<2> sz, bits<
}
// SME2.1 multi-vec ternary indexed two registers 16-bit
-// SME2 multi-vec indexed FP8 two-way dot product to FP16 two registers
multiclass sme2p1_multi_vec_array_vg2_index_16b<string mnemonic, bits<2> sz, bits<3> op,
- RegisterOperand multi_vector_ty, ZPRRegOp zpr_ty> {
+ RegisterOperand multi_vector_ty, ZPRRegOp vector_ty,
+ ValueType vt, SDPatternOperator intrinsic> {
+ def NAME : sme2_multi_vec_array_vg2_index<sz, {op{2},?,?,op{1-0},?}, MatrixOp16,
+ multi_vector_ty, vector_ty,
+ VectorIndexH, mnemonic>, SMEPseudo2Instr<NAME, 1> {
+ bits<3> i;
+ let Inst{11-10} = i{2-1};
+ let Inst{3} = i{0};
+ }
+
+ def _PSEUDO : sme2_za_array_2op_multi_index_pseudo<NAME, sme_elm_idx0_7, multi_vector_ty, vector_ty, VectorIndexH32b, SMEMatrixArray>;
+
+ def : SME2_ZA_TwoOp_VG2_Multi_Index_Pat<NAME, intrinsic, sme_elm_idx0_7, vector_ty, vt, VectorIndexH32b_timm, tileslice16>;
+
+ def : InstAlias<mnemonic # "\t$ZAda[$Rv, $imm3], $Zn, $Zm$i",
+ (!cast<Instruction>(NAME) MatrixOp16:$ZAda, MatrixIndexGPR32Op8_11:$Rv, sme_elm_idx0_7:$imm3,
+ multi_vector_ty:$Zn, vector_ty:$Zm, VectorIndexH:$i), 0>;
+}
+
+// SME2 multi-vec indexed FP8 two-way dot product to FP16 two registers
+multiclass sme2p1_multi_vec_array_vg2_index_f8f16<string mnemonic, bits<2> sz, bits<3> op,
+ RegisterOperand multi_vector_ty, ZPRRegOp zpr_ty> {
def NAME : sme2_multi_vec_array_vg2_index<sz, {op{2},?,?,op{1-0},?}, MatrixOp16,
multi_vector_ty, zpr_ty,
VectorIndexH, mnemonic> {
@@ -2569,10 +2589,10 @@ multiclass sme2_multi_vec_array_vg4_index_32b<string mnemonic, bits<4> op,
multi_vector_ty:$Zn, vector_ty:$Zm, VectorIndexS32b_timm:$i), 0>;
}
-// SME2.1 multi-vec ternary indexed four registers 16-bit
-multiclass sme2p1_multi_vec_array_vg4_index_16b<string mnemonic, bits<3> op,
- RegisterOperand multi_vector_ty,
- ZPRRegOp zpr_ty> {
+// SME2.1 multi-vec ternary indexed four registers 16-bit (FP8)
+multiclass sme2p1_multi_vec_array_vg4_index_f8f16<string mnemonic, bits<3> op,
+ RegisterOperand multi_vector_ty,
+ ZPRRegOp zpr_ty> {
def NAME : sme2_multi_vec_array_vg4_index<0b0,{0b1,?,?,op,?}, MatrixOp16,
multi_vector_ty, zpr_ty,
VectorIndexH, mnemonic>{
@@ -2586,6 +2606,28 @@ multiclass sme2p1_multi_vec_array_vg4_index_16b<string mnemonic, bits<3> op,
sme_elm_idx0_7:$imm3, multi_vector_ty:$Zn, zpr_ty:$Zm, VectorIndexH:$i), 0>;
}
+// SME2.1 multi-vec ternary indexed four registers 16-bit
+multiclass sme2p1_multi_vec_array_vg4_index_16b<string mnemonic, bits<3> op,
+ RegisterOperand multi_vector_ty,
+ ZPRRegOp vector_ty, ValueType vt,
+ SDPatternOperator intrinsic> {
+ def NAME : sme2_multi_vec_array_vg4_index<0b0,{0b1,?,?,op,?}, MatrixOp16,
+ multi_vector_ty, vector_ty,
+ VectorIndexH, mnemonic>, SMEPseudo2Instr<NAME, 1> {
+ bits<3> i;
+ let Inst{11-10} = i{2-1};
+ let Inst{3} = i{0};
+ }
+
+ def _PSEUDO : sme2_za_array_2op_multi_index_pseudo<NAME, sme_elm_idx0_7, multi_vector_ty, vector_ty, VectorIndexH32b_timm, SMEMatrixArray>;
+
+ def : SME2_ZA_TwoOp_VG4_Multi_Index_Pat<NAME, intrinsic, sme_elm_idx0_7, vector_ty, vt, VectorIndexH32b_timm, tileslice16>;
+
+ def : InstAlias<mnemonic # "\t$ZAda[$Rv, $imm3], $Zn, $Zm$i",
+ (!cast<Instruction>(NAME) MatrixOp16:$ZAda, MatrixIndexGPR32Op8_11:$Rv,
+ sme_elm_idx0_7:$imm3, multi_vector_ty:$Zn, vector_ty:$Zm, VectorIndexH:$i), 0>;
+}
+
// SME2 multi-vec ternary indexed four registers 64-bit
class sme2_multi_vec_array_vg4_index_64b<bits<3> op,
RegisterOperand multi_vector_ty,
diff --git a/llvm/test/CodeGen/AArch64/sme2-intrinsics-fmlas16.ll b/llvm/test/CodeGen/AArch64/sme2-intrinsics-fmlas16.ll
new file mode 100644
index 0000000000000..3e807b7e63384
--- /dev/null
+++ b/llvm/test/CodeGen/AArch64/sme2-intrinsics-fmlas16.ll
@@ -0,0 +1,462 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --filter-out "// kill:.*$" --version 4
+; RUN: llc -verify-machineinstrs < %s | FileCheck %s
+
+target triple = "aarch64-linux"
+
+define void @test_fmla_f16_vg2_single(i32 %slice, <vscale x 8 x half> %a0, <vscale x 8 x half> %a1, <vscale x 8 x half> %b) #0 {
+; CHECK-LABEL: test_fmla_f16_vg2_single:
+; CHECK: // %bb.0:
+; CHECK: mov w8, w0
+; CHECK: fmla za.h[w8, 0, vgx2], { z0.h, z1.h }, z2.h
+; CHECK: fmla za.h[w8, 7, vgx2], { z0.h, z1.h }, z2.h
+; CHECK: ret
+ call void @llvm.aarch64.sme.fmla.single.vg1x2.nxv8f16(i32 %slice, <vscale x 8 x half> %a0, <vscale x 8 x half> %a1, <vscale x 8 x half> %b)
+ %slice.7 = add i32 %slice, 7
+ call void @llvm.aarch64.sme.fmla.single.vg1x2.nxv8f16(i32 %slice.7, <vscale x 8 x half> %a0, <vscale x 8 x half> %a1, <vscale x 8 x half> %b)
+ ret void
+}
+
+define void @test_fmla_f16_vg4_single(i32 %slice, <vscale x 8 x half> %a0, <vscale x 8 x half> %a1,
+; CHECK-LABEL: test_fmla_f16_vg4_single:
+; CHECK: // %bb.0:
+; CHECK: mov w8, w0
+; CHECK: fmla za.h[w8, 0, vgx4], { z0.h - z3.h }, z4.h
+; CHECK: fmla za.h[w8, 7, vgx4], { z0.h - z3.h }, z4.h
+; CHECK: ret
+ <vscale x 8 x half> %a2, <vscale x 8 x half> %a3, <vscale x 8 x half> %b) #0 {
+ call void @llvm.aarch64.sme.fmla.single.vg1x4.nxv8f16(i32 %slice, <vscale x 8 x half> %a0, <vscale x 8 x half> %a1,
+ <vscale x 8 x half> %a2, <vscale x 8 x half> %a3, <vscale x 8 x half> %b)
+ %slice.7 = add i32 %slice, 7
+ call void @llvm.aarch64.sme.fmla.single.vg1x4.nxv8f16(i32 %slice.7, <vscale x 8 x half> %a0, <vscale x 8 x half> %a1,
+ <vscale x 8 x half> %a2, <vscale x 8 x half> %a3, <vscale x 8 x half> %b)
+ ret void
+}
+
+define void @test_fmls_f16_vg2_single(i32 %slice, <vscale x 8 x half> %a0, <vscale x 8 x half> %a1, <vscale x 8 x half> %b) #0 {
+; CHECK-LABEL: test_fmls_f16_vg2_single:
+; CHECK: // %bb.0:
+; CHECK: mov w8, w0
+; CHECK: fmls za.h[w8, 0, vgx2], { z0.h, z1.h }, z2.h
+; CHECK: fmls za.h[w8, 7, vgx2], { z0.h, z1.h }, z2.h
+; CHECK: ret
+ call void @llvm.aarch64.sme.fmls.single.vg1x2.nxv8f16(i32 %slice, <vscale x 8 x half> %a0, <vscale x 8 x half> %a1, <vscale x 8 x half> %b)
+ %slice.7 = add i32 %slice, 7
+ call void @llvm.aarch64.sme.fmls.single.vg1x2.nxv8f16(i32 %slice.7, <vscale x 8 x half> %a0, <vscale x 8 x half> %a1, <vscale x 8 x half> %b)
+ ret void
+}
+
+define void @test_fmls_f16_vg4_single(i32 %slice, <vscale x 8 x half> %a0, <vscale x 8 x half> %a1,
+; CHECK-LABEL: test_fmls_f16_vg4_single:
+; CHECK: // %bb.0:
+; CHECK: mov w8, w0
+; CHECK: fmls za.h[w8, 0, vgx4], { z0.h - z3.h }, z4.h
+; CHECK: fmls za.h[w8, 7, vgx4], { z0.h - z3.h }, z4.h
+; CHECK: ret
+ <vscale x 8 x half> %a2, <vscale x 8 x half> %a3, <vscale x 8 x half> %b) #0 {
+ call void @llvm.aarch64.sme.fmls.single.vg1x4.nxv8f16(i32 %slice, <vscale x 8 x half> %a0, <vscale x 8 x half> %a1,
+ <vscale x 8 x half> %a2, <vscale x 8 x half> %a3, <vscale x 8 x half> %b)
+ %slice.7 = add i32 %slice, 7
+ call void @llvm.aarch64.sme.fmls.single.vg1x4.nxv8f16(i32 %slice.7, <vscale x 8 x half> %a0, <vscale x 8 x half> %a1,
+ <vscale x 8 x half> %a2, <vscale x 8 x half> %a3, <vscale x 8 x half> %b)
+ ret void
+}
+
+define void @test_fmla_f16_vg2_multi(i32 %slice,
+; CHECK-LABEL: test_fmla_f16_vg2_multi:
+; CHECK: // %bb.0:
+; CHECK: mov w8, w0
+; CHECK: fmla za.h[w8, 0, vgx2], { z0.h, z1.h }, { z2.h, z3.h }
+; CHECK: fmla za.h[w8, 7, vgx2], { z0.h, z1.h }, { z2.h, z3.h }
+; CHECK: ret
+ <vscale x 8 x half> %a0, <vscale x 8 x half> %a1,
+ <vscale x 8 x half> %b0, <vscale x 8 x half> %b1) #0 {
+ call void @llvm.aarch64.sme.fmla.vg1x2.nxv8f16(i32 %slice,
+ <vscale x 8 x half> %a0, <vscale x 8 x half> %a1,
+ <vscale x 8 x half> %b0, <vscale x 8 x half> %b1)
+ %slice.7 = add i32 %slice, 7
+ call void @llvm.aarch64.sme.fmla.vg1x2.nxv8f16(i32 %slice.7,
+ <vscale x 8 x half> %a0, <vscale x 8 x half> %a1,
+ <vscale x 8 x half> %b0, <vscale x 8 x half> %b1)
+ ret void
+}
+
+define void @test_fmla_f16_vg4_multi(i32 %slice,
+; CHECK-LABEL: test_fmla_f16_vg4_multi:
+; CHECK: // %bb.0:
+; CHECK: mov w8, w0
+; CHECK: fmla za.h[w8, 0, vgx4], { z0.h - z3.h }, { z4.h - z7.h }
+; CHECK: fmla za.h[w8, 7, vgx4], { z0.h - z3.h }, { z4.h - z7.h }
+; CHECK: ret
+ <vscale x 8 x half> %a0, <vscale x 8 x half> %a1,
+ <vscale x 8 x half> %a2, <vscale x 8 x half> %a3,
+ <vscale x 8 x half> %b0, <vscale x 8 x half> %b1,
+ <vscale x 8 x half> %b2, <vscale x 8 x half> %b3) #0 {
+ call void @llvm.aarch64.sme.fmla.vg1x4.nxv8f16(i32 %slice,
+ <vscale x 8 x half> %a0, <vscale x 8 x half> %a1,
+ <vscale x 8 x half> %a2, <vscale x 8 x half> %a3,
+ <vscale x 8 x half> %b0, <vscale x 8 x half> %b1,
+ <vscale x 8 x half> %b2, <vscale x 8 x half> %b3)
+ %slice.7 = add i32 %slice, 7
+ call void @llvm.aarch64.sme.fmla.vg1x4.nxv8f16(i32 %slice.7,
+ <vscale x 8 x half> %a0, <vscale x 8 x half> %a1,
+ <vscale x 8 x half> %a2, <vscale x 8 x half> %a3,
+ <vscale x 8 x half> %b0, <vscale x 8 x half> %b1,
+ <vscale x 8 x half> %b2, <vscale x 8 x half> %b3)
+ ret void
+}
+
+define void @test_fmls_f16_vg2_multi(i32 %slice,
+; CHECK-LABEL: test_fmls_f16_vg2_multi:
+; CHECK: // %bb.0:
+; CHECK: mov w8, w0
+; CHECK: fmls za.h[w8, 0, vgx2], { z0.h, z1.h }, { z2.h, z3.h }
+; CHECK: fmls za.h[w8, 7, vgx2], { z0.h, z1.h }, { z2.h, z3.h }
+; CHECK: ret
+ <vscale x 8 x half> %a0, <vscale x 8 x half> %a1,
+ <vscale x 8 x half> %b0, <vscale x 8 x half> %b1) #0 {
+ call void @llvm.aarch64.sme.fmls.vg1x2.nxv8f16(i32 %slice,
+ <vscale x 8 x half> %a0, <vscale x 8 x half> %a1,
+ <vscale x 8 x half> %b0, <vscale x 8 x half> %b1)
+ %slice.7 = add i32 %slice, 7
+ call void @llvm.aarch64.sme.fmls.vg1x2.nxv8f16(i32 %slice.7,
+ <vscale x 8 x half> %a0, <vscale x 8 x half> %a1,
+ <vscale x 8 x half> %b0, <vscale x 8 x half> %b1)
+ ret void
+}
+
+define void @test_fmls_f16_vg4_multi(i32 %slice,
+; CHECK-LABEL: test_fmls_f16_vg4_multi:
+; CHECK: // %bb.0:
+; CHECK: mov w8, w0
+; CHECK: fmls za.h[w8, 0, vgx4], { z0.h - z3.h }, { z4.h - z7.h }
+; CHECK: fmls za.h[w8, 7, vgx4], { z0.h - z3.h }, { z4.h - z7.h }
+; CHECK: ret
+ <vscale x 8 x half> %a0, <vscale x 8 x half> %a1,
+ <vscale x 8 x half> %a2, <vscale x 8 x half> %a3,
+ <vscale x 8 x half> %b0, <vscale x 8 x half> %b1,
+ <vscale x 8 x half> %b2, <vscale x 8 x half> %b3) #0 {
+ call void @llvm.aarch64.sme.fmls.vg1x4.nxv8f16(i32 %slice,
+ <vscale x 8 x half> %a0, <vscale x 8 x half> %a1,
+ <vscale x 8 x half> %a2, <vscale x 8 x half> %a3,
+ <vscale x 8 x half> %b0, <vscale x 8 x half> %b1,
+ <vscale x 8 x half> %b2, <vscale x 8 x half> %b3)
+ %slice.7 = add i32 %slice, 7
+ call void @llvm.aarch64.sme.fmls.vg1x4.nxv8f16(i32 %slice.7,
+ <vscale x 8 x half> %a0, <vscale x 8 x half> %a1,
+ <vscale x 8 x half> %a2, <vscale x 8 x half> %a3,
+ <vscale x 8 x half> %b0, <vscale x 8 x half> %b1,
+ <vscale x 8 x half> %b2, <vscale x 8 x half> %b3)
+ ret void
+}
+
+define void @test_fmla_f16_vg2_index(i32 %slice,
+; CHECK-LABEL: test_fmla_f16_vg2_index:
+; CHECK: // %bb.0:
+; CHECK: mov w8, w0
+; CHECK: fmla za.h[w8, 0, vgx2], { z0.h, z1.h }, z2.h[7]
+; CHECK: fmla za.h[w8, 7, vgx2], { z0.h, z1.h }, z2.h[7]
+; CHECK: ret
+ <vscale x 8 x half> %a0, <vscale x 8 x half> %a1,
+ <vscale x 8 x half> %b) #0 {
+ call void @llvm.aarch64.sme.fmla.lane.vg1x2.nxv8f16(i32 %slice,
+ <vscale x 8 x half> %a0, <vscale x 8 x half> %a1,
+ <vscale x 8 x half> %b, i32 7);
+ %slice.7 = add i32 %slice, 7
+ call void @llvm.aarch64.sme.fmla.lane.vg1x2.nxv8f16(i32 %slice.7,
+ <vscale x 8 x half> %a0, <vscale x 8 x half> %a1,
+ <vscale x 8 x half> %b, i32 7);
+ ret void
+}
+
+define void @test_fmla_f16_vg4_index(i32 %slice,
+; CHECK-LABEL: test_fmla_f16_vg4_index:
+; CHECK: // %bb.0:
+; CHECK: mov w8, w0
+; CHECK: fmla za.h[w8, 0, vgx4], { z0.h - z3.h }, z4.h[7]
+; CHECK: fmla za.h[w8, 0, vgx4], { z0.h - z3.h }, z4.h[7]
+; CHECK: ret
+ <vscale x 8 x half> %a0, <vscale x 8 x half> %a1,
+ <vscale x 8 x half> %a2, <vscale x 8 x half> %a3,
+ <vscale x 8 x half> %b) #0 {
+ call void @llvm.aarch64.sme.fmla.lane.vg1x4.nxv8f16(i32 %slice,
+ <vscale x 8 x half> %a0, <vscale x 8 x half> %a1,
+ <vscale x 8 x half> %a2, <vscale x 8 x half> %a3,
+ <vscale x 8 x half> %b, i32 7);
+ %slice.7 = add i32 %slice, 7
+ call void @llvm.aarch64.sme.fmla.lane.vg1x4.nxv8f16(i32 %slice,
+ <vscale x 8 x half> %a0, <vscale x 8 x half> %a1,
+ <vscale x 8 x half> %a2, <vscale x 8 x half> %a3,
+ <vscale x 8 x half> %b, i32 7);
+ ret void
+}
+
+define void @test_fmls_f16_vg2_index(i32 %slice,
+; CHECK-LABEL: test_fmls_f16_vg2_index:
+; CHECK: // %bb.0:
+; CHECK: mov w8, w0
+; CHECK: fmls za.h[w8, 0, vgx2], { z0.h, z1.h }, z2.h[7]
+; CHECK: fmls za.h[w8, 7, vgx2], { z0.h, z1.h }, z2.h[7]
+; CHECK: ret
+ <vscale x 8 x half> %a0, <vscale x 8 x half> %a1,
+ <vscale x 8 x half> %b) #0 {
+ call void @llvm.aarch64.sme.fmls.lane.vg1x2.nxv8f16(i32 %slice,
+ <vscale x 8 x half> %a0, <vscale x 8 x half> %a1,
+ <vscale x 8 x half> %b, i32 7);
+ %slice.7 = add i32 %slice, 7
+ call void @llvm.aarch64.sme.fmls.lane.vg1x2.nxv8f16(i32 %slice.7,
+ <vscale x 8 x half> %a0, <vscale x 8 x half> %a1,
+ <vscale x 8 x half> %b, i32 7);
+ ret void
+}
+
+define void @test_fmls_f16_vg4_index(i32 %slice,
+; CHECK-LABEL: test_fmls_f16_vg4_index:
+; CHECK: // %bb.0:
+; CHECK: mov w8, w0
+; CHECK: fmls za.h[w8, 0, vgx4], { z0.h - z3.h }, z4.h[7]
+; CHECK: fmls za.h[w8, 0, vgx4], { z0.h - z3.h }, z4.h[7]
+; CHECK: ret
+ <vscale x 8 x half> %a0, <vscale x 8 x half> %a1,
+ <vscale x 8 x half> %a2, <vscale x 8 x half> %a3,
+ <vscale x 8 x half> %b) #0 {
+ call void @llvm.aarch64.sme.fmls.lane.vg1x4.nxv8f16(i32 %slice,
+ <vscale x 8 x half> %a0, <vscale x 8 x half> %a1,
+ <vscale x 8 x half> %a2, <vscale x 8 x half> %a3,
+ <vscale x 8 x half> %b, i32 7);
+ %slice.7 = add i32 %slice, 7
+ call void @llvm.aarch64.sme.fmls.lane.vg1x4.nxv8f16(i32 %slice,
+ <vscale x 8 x half> %a0, <vscale x 8 x half> %a1,
+ <vscale x 8 x half> %a2, <vscale x 8 x half> %a3,
+ <vscale x 8 x half> %b, i32 7);
+ ret void
+}
+
+define void @test_fmla_bf16_vg2_single(i32 %slice, <vscale x 8 x bfloat> %a0, <vscale x 8 x bfloat> %a1, <vscale x 8 x bfloat> %b) #0 {
+; CHECK-LABEL: test_fmla_bf16_vg2_single:
+; CHECK: // %bb.0:
+; CHECK: mov w8, w0
+; CHECK: bfmla za.h[w8, 0, vgx2], { z0.h, z1.h }, z2.h
+; CHECK: bfmla za.h[w8, 7, vgx2], { z0.h, z1.h }, z2.h
+; CHECK: ret
+ call void @llvm.aarch64.sme.fmla.single.vg1x2.nxv8bf16(i32 %slice, <vscale x 8 x bfloat> %a0, <vscale x 8 x bfloat> %a1, <vscale x 8 x bfloat> %b)
+ %slice.7 = add i32 %slice, 7
+ call void @llvm.aarch64.sme.fmla.single.vg1x2.nxv8bf16(i32 %slice.7, <vscale x 8 x bfloat> %a0, <vscale x 8 x bfloat> %a1, <vscale x 8 x bfloat> %b)
+ ret void
+}
+
+define void @test_fmla_bf16_vg4_single(i32 %slice, <vscale x 8 x bfloat> %a0, <vscale x 8 x bfloat> %a1,
+; CHECK-LABEL: test_fmla_bf16_vg4_single:
+; CHECK: // %bb.0:
+; CHECK: mov w8, w0
+; CHECK: bfmla za.h[w8, 0, vgx4], { z0.h - z3.h }, z4.h
+; CHECK: bfmla za.h[w8, 7, vgx4], { z0.h - z3.h }, z4.h
+; CHECK: ret
+ <vscale x 8 x bfloat> %a2, <vscale x 8 x bfloat> %a3, <vscale x 8 x bfloat> %b) #0 {
+ call void @llvm.aarch64.sme.fmla.single.vg1x4.nxv8bf16(i32 %slice, <vscale x 8 x bfloat> %a0, <vscale x 8 x bfloat> %a1,
+ <vscale x 8 x bfloat> %a2, <vscale x 8 x bfloat> %a3, <vscale x 8 x bfloat> %b)
+ %slice.7 = add i32 %slice, 7
+ call void @llvm.aarch64.sme.fmla.single.vg1x4.nxv8bf16(i32 %slice.7, <vscale x 8 x bfloat> %a0, <vscale x 8 x bfloat> %a1,
+ <vscale x 8 x bfloat> %a2, <vscale x 8 x bfloat> %a3, <vscale x 8 x bfloat> %b)
+ ret void
+}
+
+define void @test_fmls_bf16_vg2_single(i32 %slice, <vscale x 8 x bfloat> %a0, <vscale x 8 x bfloat> %a1, <vscale x 8 x bfloat> %b) #0 {
+; CHECK-LABEL: test_fmls_bf16_vg2_single:
+; CHECK: // %bb.0:
+; CHECK: mov w8, w0
+; CHECK: bfmls za.h[w8, 0, vgx2], { z0.h, z1.h }, z2.h
+; CHECK: bfmls za.h[w8, 7, vgx2], { z0.h, z1.h }, z2.h
+; CHECK: ret
+ call void @llvm.aarch64.sme.fmls.single.vg1x2.nxv8bf16(i32 %slice, <vscale x 8 x bfloat> %a0, <vscale x 8 x bfloat> %a1, <vscale x 8 x bfloat> %b)
+ %slice.7 = add i32 %slice, 7
+ call void @llvm.aarch64.sme.fmls.single.vg1x2.nxv8bf16(i32 %slice.7, <vscale x 8 x bfloat> %a0, <vscale x 8 x bfloat> %a1, <vscale x 8 x bfloat> %b)
+ ret void
+}
+
+define void @test_fmls_bf16_vg4_single(i32 %slice, <vscale x 8 x bfloat> %a0, <vscale x 8 x bfloat> %a1,
+; CHECK-LABEL: test_fmls_bf16_vg4_single:
+; CHECK: // %bb.0:
+; CHECK: mov w8, w0
+; CHECK: bfmls za.h[w8, 0, vgx4], { z0.h - z3.h }, z4.h
+; CHECK: bfmls za.h[w8, 7, vgx4], { z0.h - z3.h }, z4.h
+; CHECK: ret
+ <vscale x 8 x bfloat> %a2, <vscale x 8 x bfloat> %a3, <vscale x 8 x bfloat> %b) #0 {
+ call void @llvm.aarch64.sme.fmls.single.vg1x4.nxv8bf16(i32 %slice, <vscale x 8 x bfloat> %a0, <vscale x 8 x bfloat> %a1,
+ <vscale x 8 x bfloat> %a2, <vscale x 8 x bfloat> %a3, <vscale x 8 x bfloat> %b)
+ %slice.7 = add i32 %slice, 7
+ call void @llvm.aarch64.sme.fmls.single.vg1x4.nxv8bf16(i32 %slice.7, <vscale x 8 x bfloat> %a0, <vscale x 8 x bfloat> %a1,
+ <vscale x 8 x bfloat> %a2, <vscale x 8 x bfloat> %a3, <vscale x 8 x bfloat> %b)
+ ret void
+}
+
+define void @test_fmla_bf16_vg2_multi(i32 %slice,
+; CHECK-LABEL: test_fmla_bf16_vg2_multi:
+; CHECK: // %bb.0:
+; CHECK: mov w8, w0
+; CHECK: bfmla za.h[w8, 0, vgx2], { z0.h, z1.h }, { z2.h, z3.h }
+; CHECK: bfmla za.h[w8, 7, vgx2], { z0.h, z1.h }, { z2.h, z3.h }
+; CHECK: ret
+ <vscale x 8 x bfloat> %a0, <vscale x 8 x bfloat> %a1,
+ <vscale x 8 x bfloat> %b0, <vscale x 8 x bfloat> %b1) #0 {
+ call void @llvm.aarch64.sme.fmla.vg1x2.nxv8bf16(i32 %slice,
+ <vscale x 8 x bfloat> %a0, <vscale x 8 x bfloat> %a1,
+ <vscale x 8 x bfloat> %b0, <vscale x 8 x bfloat> %b1)
+ %slice.7 = add i32 %slice, 7
+ call void @llvm.aarch64.sme.fmla.vg1x2.nxv8bf16(i32 %slice.7,
+ <vscale x 8 x bfloat> %a0, <vscale x 8 x bfloat> %a1,
+ <vscale x 8 x bfloat> %b0, <vscale x 8 x bfloat> %b1)
+ ret void
+}
+
+define void @test_fmla_bf16_vg4_multi(i32 %slice,
+; CHECK-LABEL: test_fmla_bf16_vg4_multi:
+; CHECK: // %bb.0:
+; CHECK: mov w8, w0
+; CHECK: bfmla za.h[w8, 0, vgx4], { z0.h - z3.h }, { z4.h - z7.h }
+; CHECK: bfmla za.h[w8, 7, vgx4], { z0.h - z3.h }, { z4.h - z7.h }
+; CHECK: ret
+ <vscale x 8 x bfloat> %a0, <vscale x 8 x bfloat> %a1,
+ <vscale x 8 x bfloat> %a2, <vscale x 8 x bfloat> %a3,
+ <vscale x 8 x bfloat> %b0, <vscale x 8 x bfloat> %b1,
+ <vscale x 8 x bfloat> %b2, <vscale x 8 x bfloat> %b3) #0 {
+ call void @llvm.aarch64.sme.fmla.vg1x4.nxv8bf16(i32 %slice,
+ <vscale x 8 x bfloat> %a0, <vscale x 8 x bfloat> %a1,
+ <vscale x 8 x bfloat> %a2, <vscale x 8 x bfloat> %a3,
+ <vscale x 8 x bfloat> %b0, <vscale x 8 x bfloat> %b1,
+ <vscale x 8 x bfloat> %b2, <vscale x 8 x bfloat> %b3)
+ %slice.7 = add i32 %slice, 7
+ call void @llvm.aarch64.sme.fmla.vg1x4.nxv8bf16(i32 %slice.7,
+ <vscale x 8 x bfloat> %a0, <vscale x 8 x bfloat> %a1,
+ <vscale x 8 x bfloat> %a2, <vscale x 8 x bfloat> %a3,
+ <vscale x 8 x bfloat> %b0, <vscale x 8 x bfloat> %b1,
+ <vscale x 8 x bfloat> %b2, <vscale x 8 x bfloat> %b3)
+ ret void
+}
+
+define void @test_fmls_bf16_vg2_multi(i32 %slice,
+; CHECK-LABEL: test_fmls_bf16_vg2_multi:
+; CHECK: // %bb.0:
+; CHECK: mov w8, w0
+; CHECK: bfmls za.h[w8, 0, vgx2], { z0.h, z1.h }, { z2.h, z3.h }
+; CHECK: bfmls za.h[w8, 7, vgx2], { z0.h, z1.h }, { z2.h, z3.h }
+; CHECK: ret
+ <vscale x 8 x bfloat> %a0, <vscale x 8 x bfloat> %a1,
+ <vscale x 8 x bfloat> %b0, <vscale x 8 x bfloat> %b1) #0 {
+ call void @llvm.aarch64.sme.fmls.vg1x2.nxv8bf16(i32 %slice,
+ <vscale x 8 x bfloat> %a0, <vscale x 8 x bfloat> %a1,
+ <vscale x 8 x bfloat> %b0, <vscale x 8 x bfloat> %b1)
+ %slice.7 = add i32 %slice, 7
+ call void @llvm.aarch64.sme.fmls.vg1x2.nxv8bf16(i32 %slice.7,
+ <vscale x 8 x bfloat> %a0, <vscale x 8 x bfloat> %a1,
+ <vscale x 8 x bfloat> %b0, <vscale x 8 x bfloat> %b1)
+ ret void
+}
+
+define void @test_fmls_bf16_vg4_multi(i32 %slice,
+; CHECK-LABEL: test_fmls_bf16_vg4_multi:
+; CHECK: // %bb.0:
+; CHECK: mov w8, w0
+; CHECK: bfmls za.h[w8, 0, vgx4], { z0.h - z3.h }, { z4.h - z7.h }
+; CHECK: bfmls za.h[w8, 7, vgx4], { z0.h - z3.h }, { z4.h - z7.h }
+; CHECK: ret
+ <vscale x 8 x bfloat> %a0, <vscale x 8 x bfloat> %a1,
+ <vscale x 8 x bfloat> %a2, <vscale x 8 x bfloat> %a3,
+ <vscale x 8 x bfloat> %b0, <vscale x 8 x bfloat> %b1,
+ <vscale x 8 x bfloat> %b2, <vscale x 8 x bfloat> %b3) #0 {
+ call void @llvm.aarch64.sme.fmls.vg1x4.nxv8bf16(i32 %slice,
+ <vscale x 8 x bfloat> %a0, <vscale x 8 x bfloat> %a1,
+ <vscale x 8 x bfloat> %a2, <vscale x 8 x bfloat> %a3,
+ <vscale x 8 x bfloat> %b0, <vscale x 8 x bfloat> %b1,
+ <vscale x 8 x bfloat> %b2, <vscale x 8 x bfloat> %b3)
+ %slice.7 = add i32 %slice, 7
+ call void @llvm.aarch64.sme.fmls.vg1x4.nxv8bf16(i32 %slice.7,
+ <vscale x 8 x bfloat> %a0, <vscale x 8 x bfloat> %a1,
+ <vscale x 8 x bfloat> %a2, <vscale x 8 x bfloat> %a3,
+ <vscale x 8 x bfloat> %b0, <vscale x 8 x bfloat> %b1,
+ <vscale x 8 x bfloat> %b2, <vscale x 8 x bfloat> %b3)
+ ret void
+}
+
+define void @test_fmla_bf16_vg2_index(i32 %slice,
+; CHECK-LABEL: test_fmla_bf16_vg2_index:
+; CHECK: // %bb.0:
+; CHECK: mov w8, w0
+; CHECK: bfmla za.h[w8, 0, vgx2], { z0.h, z1.h }, z2.h[7]
+; CHECK: bfmla za.h[w8, 7, vgx2], { z0.h, z1.h }, z2.h[7]
+; CHECK: ret
+ <vscale x 8 x bfloat> %a0, <vscale x 8 x bfloat> %a1,
+ <vscale x 8 x bfloat> %b) #0 {
+ call void @llvm.aarch64.sme.fmla.lane.vg1x2.nxv8bf16(i32 %slice,
+ <vscale x 8 x bfloat> %a0, <vscale x 8 x bfloat> %a1,
+ <vscale x 8 x bfloat> %b, i32 7);
+ %slice.7 = add i32 %slice, 7
+ call void @llvm.aarch64.sme.fmla.lane.vg1x2.nxv8bf16(i32 %slice.7,
+ <vscale x 8 x bfloat> %a0, <vscale x 8 x bfloat> %a1,
+ <vscale x 8 x bfloat> %b, i32 7);
+ ret void
+}
+
+define void @test_fmla_bf16_vg4_index(i32 %slice,
+; CHECK-LABEL: test_fmla_bf16_vg4_index:
+; CHECK: // %bb.0:
+; CHECK: mov w8, w0
+; CHECK: bfmla za.h[w8, 0, vgx4], { z0.h - z3.h }, z4.h[7]
+; CHECK: bfmla za.h[w8, 0, vgx4], { z0.h - z3.h }, z4.h[7]
+; CHECK: ret
+ <vscale x 8 x bfloat> %a0, <vscale x 8 x bfloat> %a1,
+ <vscale x 8 x bfloat> %a2, <vscale x 8 x bfloat> %a3,
+ <vscale x 8 x bfloat> %b) #0 {
+ call void @llvm.aarch64.sme.fmla.lane.vg1x4.nxv8bf16(i32 %slice,
+ <vscale x 8 x bfloat> %a0, <vscale x 8 x bfloat> %a1,
+ <vscale x 8 x bfloat> %a2, <vscale x 8 x bfloat> %a3,
+ <vscale x 8 x bfloat> %b, i32 7);
+ %slice.7 = add i32 %slice, 7
+ call void @llvm.aarch64.sme.fmla.lane.vg1x4.nxv8bf16(i32 %slice,
+ <vscale x 8 x bfloat> %a0, <vscale x 8 x bfloat> %a1,
+ <vscale x 8 x bfloat> %a2, <vscale x 8 x bfloat> %a3,
+ <vscale x 8 x bfloat> %b, i32 7);
+ ret void
+}
+
+define void @test_fmls_bf16_vg2_index(i32 %slice,
+; CHECK-LABEL: test_fmls_bf16_vg2_index:
+; CHECK: // %bb.0:
+; CHECK: mov w8, w0
+; CHECK: bfmls za.h[w8, 0, vgx2], { z0.h, z1.h }, z2.h[7]
+; CHECK: bfmls za.h[w8, 7, vgx2], { z0.h, z1.h }, z2.h[7]
+; CHECK: ret
+ <vscale x 8 x bfloat> %a0, <vscale x 8 x bfloat> %a1,
+ <vscale x 8 x bfloat> %b) #0 {
+ call void @llvm.aarch64.sme.fmls.lane.vg1x2.nxv8bf16(i32 %slice,
+ <vscale x 8 x bfloat> %a0, <vscale x 8 x bfloat> %a1,
+ <vscale x 8 x bfloat> %b, i32 7);
+ %slice.7 = add i32 %slice, 7
+ call void @llvm.aarch64.sme.fmls.lane.vg1x2.nxv8bf16(i32 %slice.7,
+ <vscale x 8 x bfloat> %a0, <vscale x 8 x bfloat> %a1,
+ <vscale x 8 x bfloat> %b, i32 7);
+ ret void
+}
+
+define void @test_fmls_bf16_vg4_index(i32 %slice,
+; CHECK-LABEL: test_fmls_bf16_vg4_index:
+; CHECK: // %bb.0:
+; CHECK: mov w8, w0
+; CHECK: bfmls za.h[w8, 0, vgx4], { z0.h - z3.h }, z4.h[7]
+; CHECK: bfmls za.h[w8, 0, vgx4], { z0.h - z3.h }, z4.h[7]
+; CHECK: ret
+ <vscale x 8 x bfloat> %a0, <vscale x 8 x bfloat> %a1,
+ <vscale x 8 x bfloat> %a2, <vscale x 8 x bfloat> %a3,
+ <vscale x 8 x bfloat> %b) #0 {
+ call void @llvm.aarch64.sme.fmls.lane.vg1x4.nxv8bf16(i32 %slice,
+ <vscale x 8 x bfloat> %a0, <vscale x 8 x bfloat> %a1,
+ <vscale x 8 x bfloat> %a2, <vscale x 8 x bfloat> %a3,
+ <vscale x 8 x bfloat> %b, i32 7);
+ %slice.7 = add i32 %slice, 7
+ call void @llvm.aarch64.sme.fmls.lane.vg1x4.nxv8bf16(i32 %slice,
+ <vscale x 8 x bfloat> %a0, <vscale x 8 x bfloat> %a1,
+ <vscale x 8 x bfloat> %a2, <vscale x 8 x bfloat> %a3,
+ <vscale x 8 x bfloat> %b, i32 7);
+ ret void
+}
+
+attributes #0 = { nounwind "target-features"="+sme2p1,+sme-f16f16,+b16b16" }
>From 6ee3f45b72ebb1d508624b615d110d2b6183fc83 Mon Sep 17 00:00:00 2001
From: Momchil Velikov <momchil.velikov at arm.com>
Date: Wed, 1 May 2024 10:59:49 +0100
Subject: [PATCH 2/3] [fixup] Fix incorrect dependency on sme2p1
---
clang/include/clang/Basic/arm_sme.td | 2 +-
.../acle_sme2_fmlas16.c | 8 +++----
.../acle_sme2_fmlas16.c | 24 +++++++++----------
3 files changed, 17 insertions(+), 17 deletions(-)
diff --git a/clang/include/clang/Basic/arm_sme.td b/clang/include/clang/Basic/arm_sme.td
index 29bc627e7481a..77ea53fb83fac 100644
--- a/clang/include/clang/Basic/arm_sme.td
+++ b/clang/include/clang/Basic/arm_sme.td
@@ -458,7 +458,7 @@ let TargetGuard = "sme2,sme-f64f64" in {
def SVMLS_LANE_VG1x4_F64 : Inst<"svmls_lane_za64[_{d}]_vg1x4", "vm4di", "d", MergeNone, "aarch64_sme_fmls_lane_vg1x4", [IsStreaming, IsInOutZA], [ImmCheck<3, ImmCheck0_1>]>;
}
-let TargetGuard = "sme2p1,sme-f16f16" in {
+let TargetGuard = "sme-f16f16" in {
def SVMLA_MULTI_VG1x2_F16 : Inst<"svmla_za16[_f16]_vg1x2", "vm22", "h", MergeNone, "aarch64_sme_fmla_vg1x2", [IsStreaming, IsInOutZA], []>;
def SVMLA_MULTI_VG1x4_F16 : Inst<"svmla_za16[_f16]_vg1x4", "vm44", "h", MergeNone, "aarch64_sme_fmla_vg1x4", [IsStreaming, IsInOutZA], []>;
def SVMLS_MULTI_VG1x2_F16 : Inst<"svmls_za16[_f16]_vg1x2", "vm22", "h", MergeNone, "aarch64_sme_fmls_vg1x2", [IsStreaming, IsInOutZA], []>;
diff --git a/clang/test/CodeGen/aarch64-sme2-intrinsics/acle_sme2_fmlas16.c b/clang/test/CodeGen/aarch64-sme2-intrinsics/acle_sme2_fmlas16.c
index 81ab3e62276b2..a059f458ac2c8 100644
--- a/clang/test/CodeGen/aarch64-sme2-intrinsics/acle_sme2_fmlas16.c
+++ b/clang/test/CodeGen/aarch64-sme2-intrinsics/acle_sme2_fmlas16.c
@@ -1,8 +1,8 @@
// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 4
-// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64-none-linux-gnu -target-feature +sme2p1 -target-feature +sme-f16f16 -target-feature +b16b16 -O2 -S -Werror -Wall -emit-llvm -o - %s | FileCheck %s
-// RUN: %clang_cc1 -x c++ -fclang-abi-compat=latest -triple aarch64-none-linux-gnu -target-feature +sme2p1 -target-feature +sme-f16f16 -target-feature +b16b16 -O2 -S -Werror -Wall -emit-llvm -o - %s | FileCheck %s --check-prefix CHECK-CXX
-// RUN: %clang_cc1 -DSME_OVERLOADED_FORMS -fclang-abi-compat=latest -triple aarch64-none-linux-gnu -target-feature +sme2p1 -target-feature +sme-f16f16 -target-feature +b16b16 -O2 -S -Werror -Wall -emit-llvm -o - %s | FileCheck %s
-// RUN: %clang_cc1 -DSME_OVERLOADED_FORMS -x c++ -fclang-abi-compat=latest -triple aarch64-none-linux-gnu -target-feature +sme2p1 -target-feature +sme-f16f16 -target-feature +b16b16 -O2 -S -Werror -Wall -emit-llvm -o - %s | FileCheck %s --check-prefix CHECK-CXX
+// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64-none-linux-gnu -target-feature +sme2 -target-feature +sme-f16f16 -target-feature +b16b16 -O2 -S -Werror -Wall -emit-llvm -o - %s | FileCheck %s
+// RUN: %clang_cc1 -x c++ -fclang-abi-compat=latest -triple aarch64-none-linux-gnu -target-feature +sme2 -target-feature +sme-f16f16 -target-feature +b16b16 -O2 -S -Werror -Wall -emit-llvm -o - %s | FileCheck %s --check-prefix CHECK-CXX
+// RUN: %clang_cc1 -DSME_OVERLOADED_FORMS -fclang-abi-compat=latest -triple aarch64-none-linux-gnu -target-feature +sme2 -target-feature +sme-f16f16 -target-feature +b16b16 -O2 -S -Werror -Wall -emit-llvm -o - %s | FileCheck %s
+// RUN: %clang_cc1 -DSME_OVERLOADED_FORMS -x c++ -fclang-abi-compat=latest -triple aarch64-none-linux-gnu -target-feature +sme2 -target-feature +sme-f16f16 -target-feature +b16b16 -O2 -S -Werror -Wall -emit-llvm -o - %s | FileCheck %s --check-prefix CHECK-CXX
// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64-none-linux-gnu -target-feature +sme2p1 -target-feature +sme-f16f16 -target-feature +b16b16 -O2 -S -Werror -Wall %s -o /dev/null
diff --git a/clang/test/Sema/aarch64-sme2-intrinsics/acle_sme2_fmlas16.c b/clang/test/Sema/aarch64-sme2-intrinsics/acle_sme2_fmlas16.c
index c6cf8a518fa79..2be49ab040fc0 100644
--- a/clang/test/Sema/aarch64-sme2-intrinsics/acle_sme2_fmlas16.c
+++ b/clang/test/Sema/aarch64-sme2-intrinsics/acle_sme2_fmlas16.c
@@ -14,29 +14,29 @@ void test_features_f16f16(uint32_t slice,
svbfloat16x4_t bzn4, svbfloat16x4_t bzm4)
__arm_streaming __arm_inout("za") {
- // expected-error at +1 {{'svmla_single_za16_f16_vg1x2' needs target feature sme2p1,sme-f16f16}}
+ // expected-error at +1 {{'svmla_single_za16_f16_vg1x2' needs target feature sme-f16f16}}
svmla_single_za16_f16_vg1x2(slice, zn2, zm);
- // expected-error at +1 {{'svmla_single_za16_f16_vg1x4' needs target feature sme2p1,sme-f16f16}}
+ // expected-error at +1 {{'svmla_single_za16_f16_vg1x4' needs target feature sme-f16f16}}
svmla_single_za16_f16_vg1x4(slice, zn4, zm);
- // expected-error at +1 {{'svmls_single_za16_f16_vg1x2' needs target feature sme2p1,sme-f16f16}}
+ // expected-error at +1 {{'svmls_single_za16_f16_vg1x2' needs target feature sme-f16f16}}
svmls_single_za16_f16_vg1x2(slice, zn2, zm);
- // expected-error at +1 {{'svmls_single_za16_f16_vg1x4' needs target feature sme2p1,sme-f16f16}}
+ // expected-error at +1 {{'svmls_single_za16_f16_vg1x4' needs target feature sme-f16f16}}
svmls_single_za16_f16_vg1x4(slice, zn4, zm);
- // expected-error at +1 {{'svmla_za16_f16_vg1x2' needs target feature sme2p1,sme-f16f16}}
+ // expected-error at +1 {{'svmla_za16_f16_vg1x2' needs target feature sme-f16f16}}
svmla_za16_f16_vg1x2(slice, zn2, zm2);
- // expected-error at +1 {{'svmla_za16_f16_vg1x4' needs target feature sme2p1,sme-f16f16}}
+ // expected-error at +1 {{'svmla_za16_f16_vg1x4' needs target feature sme-f16f16}}
svmla_za16_f16_vg1x4(slice, zn4, zm4);
- // expected-error at +1 {{'svmls_za16_f16_vg1x2' needs target feature sme2p1,sme-f16f16}}
+ // expected-error at +1 {{'svmls_za16_f16_vg1x2' needs target feature sme-f16f16}}
svmls_za16_f16_vg1x2(slice, zn2, zm2);
- // expected-error at +1 {{'svmls_za16_f16_vg1x4' needs target feature sme2p1,sme-f16f16}}
+ // expected-error at +1 {{'svmls_za16_f16_vg1x4' needs target feature sme-f16f16}}
svmls_za16_f16_vg1x4(slice, zn4, zm4);
- // expected-error at +1 {{'svmla_lane_za16_f16_vg1x2' needs target feature sme2p1,sme-f16f16}}
+ // expected-error at +1 {{'svmla_lane_za16_f16_vg1x2' needs target feature sme-f16f16}}
svmla_lane_za16_f16_vg1x2(slice, zn2, zm, 7);
- // expected-error at +1 {{'svmla_lane_za16_f16_vg1x4' needs target feature sme2p1,sme-f16f16}}
+ // expected-error at +1 {{'svmla_lane_za16_f16_vg1x4' needs target feature sme-f16f16}}
svmla_lane_za16_f16_vg1x4(slice, zn4, zm, 7);
- // expected-error at +1 {{'svmls_lane_za16_f16_vg1x2' needs target feature sme2p1,sme-f16f16}}
+ // expected-error at +1 {{'svmls_lane_za16_f16_vg1x2' needs target feature sme-f16f16}}
svmls_lane_za16_f16_vg1x2(slice, zn2, zm, 7);
- // expected-error at +1 {{'svmls_lane_za16_f16_vg1x4' needs target feature sme2p1,sme-f16f16}}
+ // expected-error at +1 {{'svmls_lane_za16_f16_vg1x4' needs target feature sme-f16f16}}
svmls_lane_za16_f16_vg1x4(slice, zn4, zm, 7);
// expected-error at +1 {{'svmla_single_za16_bf16_vg1x2' needs target feature sme2,b16b16}}
>From 42f10b39d25df4b4fb9bff1a4847a9f6a827f332 Mon Sep 17 00:00:00 2001
From: Momchil Velikov <momchil.velikov at arm.com>
Date: Thu, 9 May 2024 16:47:36 +0100
Subject: [PATCH 3/3] [fixup] Fix test RUN lines
---
.../CodeGen/aarch64-sme2-intrinsics/acle_sme2_fmlas16.c | 8 ++++----
.../test/Sema/aarch64-sme2-intrinsics/acle_sme2_fmlas16.c | 2 +-
2 files changed, 5 insertions(+), 5 deletions(-)
diff --git a/clang/test/CodeGen/aarch64-sme2-intrinsics/acle_sme2_fmlas16.c b/clang/test/CodeGen/aarch64-sme2-intrinsics/acle_sme2_fmlas16.c
index a059f458ac2c8..ecc4155454145 100644
--- a/clang/test/CodeGen/aarch64-sme2-intrinsics/acle_sme2_fmlas16.c
+++ b/clang/test/CodeGen/aarch64-sme2-intrinsics/acle_sme2_fmlas16.c
@@ -1,8 +1,8 @@
// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 4
-// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64-none-linux-gnu -target-feature +sme2 -target-feature +sme-f16f16 -target-feature +b16b16 -O2 -S -Werror -Wall -emit-llvm -o - %s | FileCheck %s
-// RUN: %clang_cc1 -x c++ -fclang-abi-compat=latest -triple aarch64-none-linux-gnu -target-feature +sme2 -target-feature +sme-f16f16 -target-feature +b16b16 -O2 -S -Werror -Wall -emit-llvm -o - %s | FileCheck %s --check-prefix CHECK-CXX
-// RUN: %clang_cc1 -DSME_OVERLOADED_FORMS -fclang-abi-compat=latest -triple aarch64-none-linux-gnu -target-feature +sme2 -target-feature +sme-f16f16 -target-feature +b16b16 -O2 -S -Werror -Wall -emit-llvm -o - %s | FileCheck %s
-// RUN: %clang_cc1 -DSME_OVERLOADED_FORMS -x c++ -fclang-abi-compat=latest -triple aarch64-none-linux-gnu -target-feature +sme2 -target-feature +sme-f16f16 -target-feature +b16b16 -O2 -S -Werror -Wall -emit-llvm -o - %s | FileCheck %s --check-prefix CHECK-CXX
+// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64-none-linux-gnu -target-feature +sme2 -target-feature +sme-f16f16 -target-feature +b16b16 -O2 -Werror -Wall -emit-llvm -o - %s | FileCheck %s
+// RUN: %clang_cc1 -x c++ -fclang-abi-compat=latest -triple aarch64-none-linux-gnu -target-feature +sme2 -target-feature +sme-f16f16 -target-feature +b16b16 -O2 -Werror -Wall -emit-llvm -o - %s | FileCheck %s --check-prefix CHECK-CXX
+// RUN: %clang_cc1 -DSME_OVERLOADED_FORMS -fclang-abi-compat=latest -triple aarch64-none-linux-gnu -target-feature +sme2 -target-feature +sme-f16f16 -target-feature +b16b16 -O2 -Werror -Wall -emit-llvm -o - %s | FileCheck %s
+// RUN: %clang_cc1 -DSME_OVERLOADED_FORMS -x c++ -fclang-abi-compat=latest -triple aarch64-none-linux-gnu -target-feature +sme2 -target-feature +sme-f16f16 -target-feature +b16b16 -O2 -Werror -Wall -emit-llvm -o - %s | FileCheck %s --check-prefix CHECK-CXX
// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64-none-linux-gnu -target-feature +sme2p1 -target-feature +sme-f16f16 -target-feature +b16b16 -O2 -S -Werror -Wall %s -o /dev/null
diff --git a/clang/test/Sema/aarch64-sme2-intrinsics/acle_sme2_fmlas16.c b/clang/test/Sema/aarch64-sme2-intrinsics/acle_sme2_fmlas16.c
index 2be49ab040fc0..b1582569971d4 100644
--- a/clang/test/Sema/aarch64-sme2-intrinsics/acle_sme2_fmlas16.c
+++ b/clang/test/Sema/aarch64-sme2-intrinsics/acle_sme2_fmlas16.c
@@ -1,4 +1,4 @@
-// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme -verify -S -emit-llvm %s
+// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme -verify -emit-llvm %s
// REQUIRES: aarch64-registered-target
More information about the cfe-commits
mailing list