[clang] [llvm] [AArch64] Add intrinsics for 16-bit non-widening FMLA/FMLS (PR #88553)

Momchil Velikov via cfe-commits cfe-commits at lists.llvm.org
Fri Apr 12 11:40:35 PDT 2024


https://github.com/momchil-velikov created https://github.com/llvm/llvm-project/pull/88553

This patch adds the ACLE intrinsics for the non-widening, half-precision (FP16) and BFloat16 FMLA/FMLS instructions: multi-vector, single-vector, and indexed (lane) forms, accumulating into 16-bit ZA slices. The FP16 variants are guarded by `sme2p1,sme-f16f16` and the BFloat16 variants by `sme2,b16b16`.
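For context, a minimal usage sketch (not part of the patch), based on the signatures exercised by the new tests below; the wrapper function name is illustrative only, and a compiler with `+sme2p1 +sme-f16f16` is assumed:

```c
#include <arm_sme.h>

// Hypothetical streaming-mode kernel updating 16-bit ZA slices in place.
void kernel(uint32_t slice, svfloat16x2_t zn, svfloat16x2_t zm, svfloat16_t zscalar)
    __arm_streaming __arm_inout("za") {
  // Multi-vector form: two ZA slices accumulate zn[i] * zm[i].
  svmla_za16_f16_vg1x2(slice, zn, zm);
  // Single-vector form: every vector of zn is multiplied by the same zscalar.
  svmla_single_za16_f16_vg1x2(slice, zn, zscalar);
  // Indexed form: the lane index must be an immediate in [0, 7].
  svmla_lane_za16_f16_vg1x2(slice, zn, zscalar, 7);
  // Matching fused multiply-subtract form.
  svmls_za16_f16_vg1x2(slice, zn, zm);
}
```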

From 531fd1fadc8739d486a52629fdf73ec4e456db0e Mon Sep 17 00:00:00 2001
From: Momchil Velikov <momchil.velikov at arm.com>
Date: Fri, 12 Apr 2024 19:35:47 +0100
Subject: [PATCH] [AArch64] Add intrinsics for 16-bit non-widening FMLA/FMLS

---
 clang/include/clang/Basic/arm_sme.td          |  34 +
 .../acle_sme2_fmlas16.c                       | 592 ++++++++++++++++++
 .../acle_sme2_fmlas16.c                       |  90 +++
 .../lib/Target/AArch64/AArch64SMEInstrInfo.td |  57 +-
 llvm/lib/Target/AArch64/SMEInstrFormats.td    |  54 +-
 .../AArch64/sme2-intrinsics-fmlas16.ll        | 462 ++++++++++++++
 6 files changed, 1254 insertions(+), 35 deletions(-)
 create mode 100644 clang/test/CodeGen/aarch64-sme2-intrinsics/acle_sme2_fmlas16.c
 create mode 100644 clang/test/Sema/aarch64-sme2-intrinsics/acle_sme2_fmlas16.c
 create mode 100644 llvm/test/CodeGen/AArch64/sme2-intrinsics-fmlas16.ll

diff --git a/clang/include/clang/Basic/arm_sme.td b/clang/include/clang/Basic/arm_sme.td
index 1ac6d5170ea283..29bc627e7481ab 100644
--- a/clang/include/clang/Basic/arm_sme.td
+++ b/clang/include/clang/Basic/arm_sme.td
@@ -458,6 +458,40 @@ let TargetGuard = "sme2,sme-f64f64" in {
   def SVMLS_LANE_VG1x4_F64 : Inst<"svmls_lane_za64[_{d}]_vg1x4", "vm4di", "d", MergeNone, "aarch64_sme_fmls_lane_vg1x4", [IsStreaming, IsInOutZA], [ImmCheck<3, ImmCheck0_1>]>;
 }
 
+let TargetGuard = "sme2p1,sme-f16f16" in {
+  def SVMLA_MULTI_VG1x2_F16 : Inst<"svmla_za16[_f16]_vg1x2", "vm22", "h", MergeNone, "aarch64_sme_fmla_vg1x2", [IsStreaming, IsInOutZA], []>;
+  def SVMLA_MULTI_VG1x4_F16 : Inst<"svmla_za16[_f16]_vg1x4", "vm44", "h", MergeNone, "aarch64_sme_fmla_vg1x4", [IsStreaming, IsInOutZA], []>;
+  def SVMLS_MULTI_VG1x2_F16 : Inst<"svmls_za16[_f16]_vg1x2", "vm22", "h", MergeNone, "aarch64_sme_fmls_vg1x2", [IsStreaming, IsInOutZA], []>;
+  def SVMLS_MULTI_VG1x4_F16 : Inst<"svmls_za16[_f16]_vg1x4", "vm44", "h", MergeNone, "aarch64_sme_fmls_vg1x4", [IsStreaming, IsInOutZA], []>;
+
+  def SVMLA_SINGLE_VG1x2_F16 : Inst<"svmla[_single]_za16[_f16]_vg1x2", "vm2d", "h", MergeNone, "aarch64_sme_fmla_single_vg1x2", [IsStreaming, IsInOutZA], []>;
+  def SVMLA_SINGLE_VG1x4_F16 : Inst<"svmla[_single]_za16[_f16]_vg1x4", "vm4d", "h", MergeNone, "aarch64_sme_fmla_single_vg1x4", [IsStreaming, IsInOutZA], []>;
+  def SVMLS_SINGLE_VG1x2_F16 : Inst<"svmls[_single]_za16[_f16]_vg1x2", "vm2d", "h", MergeNone, "aarch64_sme_fmls_single_vg1x2", [IsStreaming, IsInOutZA], []>;
+  def SVMLS_SINGLE_VG1x4_F16 : Inst<"svmls[_single]_za16[_f16]_vg1x4", "vm4d", "h", MergeNone, "aarch64_sme_fmls_single_vg1x4", [IsStreaming, IsInOutZA], []>;
+
+  def SVMLA_LANE_VG1x2_F16 : Inst<"svmla_lane_za16[_f16]_vg1x2", "vm2di", "h", MergeNone, "aarch64_sme_fmla_lane_vg1x2", [IsStreaming, IsInOutZA], [ImmCheck<3, ImmCheck0_7>]>;
+  def SVMLA_LANE_VG1x4_F16 : Inst<"svmla_lane_za16[_f16]_vg1x4", "vm4di", "h", MergeNone, "aarch64_sme_fmla_lane_vg1x4", [IsStreaming, IsInOutZA], [ImmCheck<3, ImmCheck0_7>]>;
+  def SVMLS_LANE_VG1x2_F16 : Inst<"svmls_lane_za16[_f16]_vg1x2", "vm2di", "h", MergeNone, "aarch64_sme_fmls_lane_vg1x2", [IsStreaming, IsInOutZA], [ImmCheck<3, ImmCheck0_7>]>;
+  def SVMLS_LANE_VG1x4_F16 : Inst<"svmls_lane_za16[_f16]_vg1x4", "vm4di", "h", MergeNone, "aarch64_sme_fmls_lane_vg1x4", [IsStreaming, IsInOutZA], [ImmCheck<3, ImmCheck0_7>]>;
+}
+
+let TargetGuard = "sme2,b16b16" in {
+  def SVMLA_MULTI_VG1x2_BF16 : Inst<"svmla_za16[_bf16]_vg1x2", "vm22", "b", MergeNone, "aarch64_sme_fmla_vg1x2", [IsStreaming, IsInOutZA], []>;
+  def SVMLA_MULTI_VG1x4_BF16 : Inst<"svmla_za16[_bf16]_vg1x4", "vm44", "b", MergeNone, "aarch64_sme_fmla_vg1x4", [IsStreaming, IsInOutZA], []>;
+  def SVMLS_MULTI_VG1x2_BF16 : Inst<"svmls_za16[_bf16]_vg1x2", "vm22", "b", MergeNone, "aarch64_sme_fmls_vg1x2", [IsStreaming, IsInOutZA], []>;
+  def SVMLS_MULTI_VG1x4_BF16 : Inst<"svmls_za16[_bf16]_vg1x4", "vm44", "b", MergeNone, "aarch64_sme_fmls_vg1x4", [IsStreaming, IsInOutZA], []>;
+
+  def SVMLA_SINGLE_VG1x2_BF16 : Inst<"svmla[_single]_za16[_bf16]_vg1x2", "vm2d", "b", MergeNone, "aarch64_sme_fmla_single_vg1x2", [IsStreaming, IsInOutZA], []>;
+  def SVMLA_SINGLE_VG1x4_BF16 : Inst<"svmla[_single]_za16[_bf16]_vg1x4", "vm4d", "b", MergeNone, "aarch64_sme_fmla_single_vg1x4", [IsStreaming, IsInOutZA], []>;
+  def SVMLS_SINGLE_VG1x2_BF16 : Inst<"svmls[_single]_za16[_bf16]_vg1x2", "vm2d", "b", MergeNone, "aarch64_sme_fmls_single_vg1x2", [IsStreaming, IsInOutZA], []>;
+  def SVMLS_SINGLE_VG1x4_BF16 : Inst<"svmls[_single]_za16[_bf16]_vg1x4", "vm4d", "b", MergeNone, "aarch64_sme_fmls_single_vg1x4", [IsStreaming, IsInOutZA], []>;
+
+  def SVMLA_LANE_VG1x2_BF16 : Inst<"svmla_lane_za16[_bf16]_vg1x2", "vm2di", "b", MergeNone, "aarch64_sme_fmla_lane_vg1x2", [IsStreaming, IsInOutZA], [ImmCheck<3, ImmCheck0_7>]>;
+  def SVMLA_LANE_VG1x4_BF16 : Inst<"svmla_lane_za16[_bf16]_vg1x4", "vm4di", "b", MergeNone, "aarch64_sme_fmla_lane_vg1x4", [IsStreaming, IsInOutZA], [ImmCheck<3, ImmCheck0_7>]>;
+  def SVMLS_LANE_VG1x2_BF16 : Inst<"svmls_lane_za16[_bf16]_vg1x2", "vm2di", "b", MergeNone, "aarch64_sme_fmls_lane_vg1x2", [IsStreaming, IsInOutZA], [ImmCheck<3, ImmCheck0_7>]>;
+  def SVMLS_LANE_VG1x4_BF16 : Inst<"svmls_lane_za16[_bf16]_vg1x4", "vm4di", "b", MergeNone, "aarch64_sme_fmls_lane_vg1x4", [IsStreaming, IsInOutZA], [ImmCheck<3, ImmCheck0_7>]>;
+}
+
 // FMLAL/FMLSL/UMLAL/SMLAL
 // SMLALL/UMLALL/USMLALL/SUMLALL
 let TargetGuard = "sme2" in {
diff --git a/clang/test/CodeGen/aarch64-sme2-intrinsics/acle_sme2_fmlas16.c b/clang/test/CodeGen/aarch64-sme2-intrinsics/acle_sme2_fmlas16.c
new file mode 100644
index 00000000000000..81ab3e62276b2b
--- /dev/null
+++ b/clang/test/CodeGen/aarch64-sme2-intrinsics/acle_sme2_fmlas16.c
@@ -0,0 +1,592 @@
+// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 4
+// RUN: %clang_cc1                               -fclang-abi-compat=latest -triple aarch64-none-linux-gnu -target-feature +sme2p1 -target-feature +sme-f16f16 -target-feature +b16b16 -O2 -S -Werror -Wall -emit-llvm -o - %s | FileCheck %s
+// RUN: %clang_cc1                        -x c++ -fclang-abi-compat=latest -triple aarch64-none-linux-gnu -target-feature +sme2p1 -target-feature +sme-f16f16 -target-feature +b16b16 -O2 -S -Werror -Wall -emit-llvm -o - %s | FileCheck %s --check-prefix CHECK-CXX
+// RUN: %clang_cc1 -DSME_OVERLOADED_FORMS        -fclang-abi-compat=latest -triple aarch64-none-linux-gnu -target-feature +sme2p1 -target-feature +sme-f16f16 -target-feature +b16b16 -O2 -S -Werror -Wall -emit-llvm -o - %s | FileCheck %s
+// RUN: %clang_cc1 -DSME_OVERLOADED_FORMS -x c++ -fclang-abi-compat=latest -triple aarch64-none-linux-gnu -target-feature +sme2p1 -target-feature +sme-f16f16 -target-feature +b16b16 -O2 -S -Werror -Wall -emit-llvm -o - %s | FileCheck %s --check-prefix CHECK-CXX
+
+// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64-none-linux-gnu -target-feature +sme2p1 -target-feature +sme-f16f16 -target-feature +b16b16 -O2 -S -Werror -Wall %s -o /dev/null
+
+// REQUIRES: aarch64-registered-target
+#include <arm_sme.h>
+
+#ifdef SME_OVERLOADED_FORMS
+#define SME_ACLE_FUNC(A1, A2_UNUSED, A3, A4_UNUSED, A5) A1##A3##A5
+#else
+#define SME_ACLE_FUNC(A1, A2, A3, A4, A5) A1##A2##A3##A4##A5
+#endif
+
+// CHECK-LABEL: define dso_local void @test_svmla_single_za16_f16_vg1x2(
+// CHECK-SAME: i32 noundef [[SLICE:%.*]], <vscale x 16 x half> [[ZN:%.*]], <vscale x 8 x half> [[ZM:%.*]]) local_unnamed_addr #[[ATTR0:[0-9]+]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[TMP0:%.*]] = tail call <vscale x 8 x half> @llvm.vector.extract.nxv8f16.nxv16f16(<vscale x 16 x half> [[ZN]], i64 0)
+// CHECK-NEXT:    [[TMP1:%.*]] = tail call <vscale x 8 x half> @llvm.vector.extract.nxv8f16.nxv16f16(<vscale x 16 x half> [[ZN]], i64 8)
+// CHECK-NEXT:    tail call void @llvm.aarch64.sme.fmla.single.vg1x2.nxv8f16(i32 [[SLICE]], <vscale x 8 x half> [[TMP0]], <vscale x 8 x half> [[TMP1]], <vscale x 8 x half> [[ZM]])
+// CHECK-NEXT:    ret void
+//
+// CHECK-CXX-LABEL: define dso_local void @_Z32test_svmla_single_za16_f16_vg1x2j13svfloat16x2_tu13__SVFloat16_t(
+// CHECK-CXX-SAME: i32 noundef [[SLICE:%.*]], <vscale x 16 x half> [[ZN:%.*]], <vscale x 8 x half> [[ZM:%.*]]) local_unnamed_addr #[[ATTR0:[0-9]+]] {
+// CHECK-CXX-NEXT:  entry:
+// CHECK-CXX-NEXT:    [[TMP0:%.*]] = tail call <vscale x 8 x half> @llvm.vector.extract.nxv8f16.nxv16f16(<vscale x 16 x half> [[ZN]], i64 0)
+// CHECK-CXX-NEXT:    [[TMP1:%.*]] = tail call <vscale x 8 x half> @llvm.vector.extract.nxv8f16.nxv16f16(<vscale x 16 x half> [[ZN]], i64 8)
+// CHECK-CXX-NEXT:    tail call void @llvm.aarch64.sme.fmla.single.vg1x2.nxv8f16(i32 [[SLICE]], <vscale x 8 x half> [[TMP0]], <vscale x 8 x half> [[TMP1]], <vscale x 8 x half> [[ZM]])
+// CHECK-CXX-NEXT:    ret void
+//
+void test_svmla_single_za16_f16_vg1x2(uint32_t slice, svfloat16x2_t zn, svfloat16_t zm) __arm_streaming __arm_inout("za") {
+  SME_ACLE_FUNC(svmla,_single,_za16,_f16,_vg1x2)(slice, zn, zm);
+}
+
+// CHECK-LABEL: define dso_local void @test_svmla_single_za16_f16_vg1x4(
+// CHECK-SAME: i32 noundef [[SLICE:%.*]], <vscale x 32 x half> [[ZN:%.*]], <vscale x 8 x half> [[ZM:%.*]]) local_unnamed_addr #[[ATTR0]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[TMP0:%.*]] = tail call <vscale x 8 x half> @llvm.vector.extract.nxv8f16.nxv32f16(<vscale x 32 x half> [[ZN]], i64 0)
+// CHECK-NEXT:    [[TMP1:%.*]] = tail call <vscale x 8 x half> @llvm.vector.extract.nxv8f16.nxv32f16(<vscale x 32 x half> [[ZN]], i64 8)
+// CHECK-NEXT:    [[TMP2:%.*]] = tail call <vscale x 8 x half> @llvm.vector.extract.nxv8f16.nxv32f16(<vscale x 32 x half> [[ZN]], i64 16)
+// CHECK-NEXT:    [[TMP3:%.*]] = tail call <vscale x 8 x half> @llvm.vector.extract.nxv8f16.nxv32f16(<vscale x 32 x half> [[ZN]], i64 24)
+// CHECK-NEXT:    tail call void @llvm.aarch64.sme.fmla.single.vg1x4.nxv8f16(i32 [[SLICE]], <vscale x 8 x half> [[TMP0]], <vscale x 8 x half> [[TMP1]], <vscale x 8 x half> [[TMP2]], <vscale x 8 x half> [[TMP3]], <vscale x 8 x half> [[ZM]])
+// CHECK-NEXT:    ret void
+//
+// CHECK-CXX-LABEL: define dso_local void @_Z32test_svmla_single_za16_f16_vg1x4j13svfloat16x4_tu13__SVFloat16_t(
+// CHECK-CXX-SAME: i32 noundef [[SLICE:%.*]], <vscale x 32 x half> [[ZN:%.*]], <vscale x 8 x half> [[ZM:%.*]]) local_unnamed_addr #[[ATTR0]] {
+// CHECK-CXX-NEXT:  entry:
+// CHECK-CXX-NEXT:    [[TMP0:%.*]] = tail call <vscale x 8 x half> @llvm.vector.extract.nxv8f16.nxv32f16(<vscale x 32 x half> [[ZN]], i64 0)
+// CHECK-CXX-NEXT:    [[TMP1:%.*]] = tail call <vscale x 8 x half> @llvm.vector.extract.nxv8f16.nxv32f16(<vscale x 32 x half> [[ZN]], i64 8)
+// CHECK-CXX-NEXT:    [[TMP2:%.*]] = tail call <vscale x 8 x half> @llvm.vector.extract.nxv8f16.nxv32f16(<vscale x 32 x half> [[ZN]], i64 16)
+// CHECK-CXX-NEXT:    [[TMP3:%.*]] = tail call <vscale x 8 x half> @llvm.vector.extract.nxv8f16.nxv32f16(<vscale x 32 x half> [[ZN]], i64 24)
+// CHECK-CXX-NEXT:    tail call void @llvm.aarch64.sme.fmla.single.vg1x4.nxv8f16(i32 [[SLICE]], <vscale x 8 x half> [[TMP0]], <vscale x 8 x half> [[TMP1]], <vscale x 8 x half> [[TMP2]], <vscale x 8 x half> [[TMP3]], <vscale x 8 x half> [[ZM]])
+// CHECK-CXX-NEXT:    ret void
+//
+void test_svmla_single_za16_f16_vg1x4(uint32_t slice, svfloat16x4_t zn, svfloat16_t zm) __arm_streaming __arm_inout("za") {
+  SME_ACLE_FUNC(svmla,_single,_za16,_f16,_vg1x4)(slice, zn, zm);
+}
+
+// CHECK-LABEL: define dso_local void @test_svmls_single_za16_f16_vg1x2(
+// CHECK-SAME: i32 noundef [[SLICE:%.*]], <vscale x 16 x half> [[ZN:%.*]], <vscale x 8 x half> [[ZM:%.*]]) local_unnamed_addr #[[ATTR0]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[TMP0:%.*]] = tail call <vscale x 8 x half> @llvm.vector.extract.nxv8f16.nxv16f16(<vscale x 16 x half> [[ZN]], i64 0)
+// CHECK-NEXT:    [[TMP1:%.*]] = tail call <vscale x 8 x half> @llvm.vector.extract.nxv8f16.nxv16f16(<vscale x 16 x half> [[ZN]], i64 8)
+// CHECK-NEXT:    tail call void @llvm.aarch64.sme.fmls.single.vg1x2.nxv8f16(i32 [[SLICE]], <vscale x 8 x half> [[TMP0]], <vscale x 8 x half> [[TMP1]], <vscale x 8 x half> [[ZM]])
+// CHECK-NEXT:    ret void
+//
+// CHECK-CXX-LABEL: define dso_local void @_Z32test_svmls_single_za16_f16_vg1x2j13svfloat16x2_tu13__SVFloat16_t(
+// CHECK-CXX-SAME: i32 noundef [[SLICE:%.*]], <vscale x 16 x half> [[ZN:%.*]], <vscale x 8 x half> [[ZM:%.*]]) local_unnamed_addr #[[ATTR0]] {
+// CHECK-CXX-NEXT:  entry:
+// CHECK-CXX-NEXT:    [[TMP0:%.*]] = tail call <vscale x 8 x half> @llvm.vector.extract.nxv8f16.nxv16f16(<vscale x 16 x half> [[ZN]], i64 0)
+// CHECK-CXX-NEXT:    [[TMP1:%.*]] = tail call <vscale x 8 x half> @llvm.vector.extract.nxv8f16.nxv16f16(<vscale x 16 x half> [[ZN]], i64 8)
+// CHECK-CXX-NEXT:    tail call void @llvm.aarch64.sme.fmls.single.vg1x2.nxv8f16(i32 [[SLICE]], <vscale x 8 x half> [[TMP0]], <vscale x 8 x half> [[TMP1]], <vscale x 8 x half> [[ZM]])
+// CHECK-CXX-NEXT:    ret void
+//
+void test_svmls_single_za16_f16_vg1x2(uint32_t slice, svfloat16x2_t zn, svfloat16_t zm) __arm_streaming __arm_inout("za") {
+  SME_ACLE_FUNC(svmls,_single,_za16,_f16,_vg1x2)(slice, zn, zm);
+}
+
+// CHECK-LABEL: define dso_local void @test_svmls_single_za16_f16_vg1x4(
+// CHECK-SAME: i32 noundef [[SLICE:%.*]], <vscale x 32 x half> [[ZN:%.*]], <vscale x 8 x half> [[ZM:%.*]]) local_unnamed_addr #[[ATTR0]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[TMP0:%.*]] = tail call <vscale x 8 x half> @llvm.vector.extract.nxv8f16.nxv32f16(<vscale x 32 x half> [[ZN]], i64 0)
+// CHECK-NEXT:    [[TMP1:%.*]] = tail call <vscale x 8 x half> @llvm.vector.extract.nxv8f16.nxv32f16(<vscale x 32 x half> [[ZN]], i64 8)
+// CHECK-NEXT:    [[TMP2:%.*]] = tail call <vscale x 8 x half> @llvm.vector.extract.nxv8f16.nxv32f16(<vscale x 32 x half> [[ZN]], i64 16)
+// CHECK-NEXT:    [[TMP3:%.*]] = tail call <vscale x 8 x half> @llvm.vector.extract.nxv8f16.nxv32f16(<vscale x 32 x half> [[ZN]], i64 24)
+// CHECK-NEXT:    tail call void @llvm.aarch64.sme.fmls.single.vg1x4.nxv8f16(i32 [[SLICE]], <vscale x 8 x half> [[TMP0]], <vscale x 8 x half> [[TMP1]], <vscale x 8 x half> [[TMP2]], <vscale x 8 x half> [[TMP3]], <vscale x 8 x half> [[ZM]])
+// CHECK-NEXT:    ret void
+//
+// CHECK-CXX-LABEL: define dso_local void @_Z32test_svmls_single_za16_f16_vg1x4j13svfloat16x4_tu13__SVFloat16_t(
+// CHECK-CXX-SAME: i32 noundef [[SLICE:%.*]], <vscale x 32 x half> [[ZN:%.*]], <vscale x 8 x half> [[ZM:%.*]]) local_unnamed_addr #[[ATTR0]] {
+// CHECK-CXX-NEXT:  entry:
+// CHECK-CXX-NEXT:    [[TMP0:%.*]] = tail call <vscale x 8 x half> @llvm.vector.extract.nxv8f16.nxv32f16(<vscale x 32 x half> [[ZN]], i64 0)
+// CHECK-CXX-NEXT:    [[TMP1:%.*]] = tail call <vscale x 8 x half> @llvm.vector.extract.nxv8f16.nxv32f16(<vscale x 32 x half> [[ZN]], i64 8)
+// CHECK-CXX-NEXT:    [[TMP2:%.*]] = tail call <vscale x 8 x half> @llvm.vector.extract.nxv8f16.nxv32f16(<vscale x 32 x half> [[ZN]], i64 16)
+// CHECK-CXX-NEXT:    [[TMP3:%.*]] = tail call <vscale x 8 x half> @llvm.vector.extract.nxv8f16.nxv32f16(<vscale x 32 x half> [[ZN]], i64 24)
+// CHECK-CXX-NEXT:    tail call void @llvm.aarch64.sme.fmls.single.vg1x4.nxv8f16(i32 [[SLICE]], <vscale x 8 x half> [[TMP0]], <vscale x 8 x half> [[TMP1]], <vscale x 8 x half> [[TMP2]], <vscale x 8 x half> [[TMP3]], <vscale x 8 x half> [[ZM]])
+// CHECK-CXX-NEXT:    ret void
+//
+void test_svmls_single_za16_f16_vg1x4(uint32_t slice, svfloat16x4_t zn, svfloat16_t zm) __arm_streaming __arm_inout("za") {
+  SME_ACLE_FUNC(svmls,_single,_za16,_f16,_vg1x4)(slice, zn, zm);
+}
+
+// CHECK-LABEL: define dso_local void @test_svmla_za16_f16_vg1x2(
+// CHECK-SAME: i32 noundef [[SLICE:%.*]], <vscale x 16 x half> [[ZN:%.*]], <vscale x 16 x half> [[ZM:%.*]]) local_unnamed_addr #[[ATTR0]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[TMP0:%.*]] = tail call <vscale x 8 x half> @llvm.vector.extract.nxv8f16.nxv16f16(<vscale x 16 x half> [[ZN]], i64 0)
+// CHECK-NEXT:    [[TMP1:%.*]] = tail call <vscale x 8 x half> @llvm.vector.extract.nxv8f16.nxv16f16(<vscale x 16 x half> [[ZN]], i64 8)
+// CHECK-NEXT:    [[TMP2:%.*]] = tail call <vscale x 8 x half> @llvm.vector.extract.nxv8f16.nxv16f16(<vscale x 16 x half> [[ZM]], i64 0)
+// CHECK-NEXT:    [[TMP3:%.*]] = tail call <vscale x 8 x half> @llvm.vector.extract.nxv8f16.nxv16f16(<vscale x 16 x half> [[ZM]], i64 8)
+// CHECK-NEXT:    tail call void @llvm.aarch64.sme.fmla.vg1x2.nxv8f16(i32 [[SLICE]], <vscale x 8 x half> [[TMP0]], <vscale x 8 x half> [[TMP1]], <vscale x 8 x half> [[TMP2]], <vscale x 8 x half> [[TMP3]])
+// CHECK-NEXT:    ret void
+//
+// CHECK-CXX-LABEL: define dso_local void @_Z25test_svmla_za16_f16_vg1x2j13svfloat16x2_tS_(
+// CHECK-CXX-SAME: i32 noundef [[SLICE:%.*]], <vscale x 16 x half> [[ZN:%.*]], <vscale x 16 x half> [[ZM:%.*]]) local_unnamed_addr #[[ATTR0]] {
+// CHECK-CXX-NEXT:  entry:
+// CHECK-CXX-NEXT:    [[TMP0:%.*]] = tail call <vscale x 8 x half> @llvm.vector.extract.nxv8f16.nxv16f16(<vscale x 16 x half> [[ZN]], i64 0)
+// CHECK-CXX-NEXT:    [[TMP1:%.*]] = tail call <vscale x 8 x half> @llvm.vector.extract.nxv8f16.nxv16f16(<vscale x 16 x half> [[ZN]], i64 8)
+// CHECK-CXX-NEXT:    [[TMP2:%.*]] = tail call <vscale x 8 x half> @llvm.vector.extract.nxv8f16.nxv16f16(<vscale x 16 x half> [[ZM]], i64 0)
+// CHECK-CXX-NEXT:    [[TMP3:%.*]] = tail call <vscale x 8 x half> @llvm.vector.extract.nxv8f16.nxv16f16(<vscale x 16 x half> [[ZM]], i64 8)
+// CHECK-CXX-NEXT:    tail call void @llvm.aarch64.sme.fmla.vg1x2.nxv8f16(i32 [[SLICE]], <vscale x 8 x half> [[TMP0]], <vscale x 8 x half> [[TMP1]], <vscale x 8 x half> [[TMP2]], <vscale x 8 x half> [[TMP3]])
+// CHECK-CXX-NEXT:    ret void
+//
+void test_svmla_za16_f16_vg1x2(uint32_t slice, svfloat16x2_t zn, svfloat16x2_t zm) __arm_streaming __arm_inout("za") {
+  SME_ACLE_FUNC(svmla,,_za16,_f16,_vg1x2)(slice, zn, zm);
+}
+
+// CHECK-LABEL: define dso_local void @test_svmla_za16_f16_vg1x4(
+// CHECK-SAME: i32 noundef [[SLICE:%.*]], <vscale x 32 x half> [[ZN:%.*]], <vscale x 32 x half> [[ZM:%.*]]) local_unnamed_addr #[[ATTR0]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[TMP0:%.*]] = tail call <vscale x 8 x half> @llvm.vector.extract.nxv8f16.nxv32f16(<vscale x 32 x half> [[ZN]], i64 0)
+// CHECK-NEXT:    [[TMP1:%.*]] = tail call <vscale x 8 x half> @llvm.vector.extract.nxv8f16.nxv32f16(<vscale x 32 x half> [[ZN]], i64 8)
+// CHECK-NEXT:    [[TMP2:%.*]] = tail call <vscale x 8 x half> @llvm.vector.extract.nxv8f16.nxv32f16(<vscale x 32 x half> [[ZN]], i64 16)
+// CHECK-NEXT:    [[TMP3:%.*]] = tail call <vscale x 8 x half> @llvm.vector.extract.nxv8f16.nxv32f16(<vscale x 32 x half> [[ZN]], i64 24)
+// CHECK-NEXT:    [[TMP4:%.*]] = tail call <vscale x 8 x half> @llvm.vector.extract.nxv8f16.nxv32f16(<vscale x 32 x half> [[ZM]], i64 0)
+// CHECK-NEXT:    [[TMP5:%.*]] = tail call <vscale x 8 x half> @llvm.vector.extract.nxv8f16.nxv32f16(<vscale x 32 x half> [[ZM]], i64 8)
+// CHECK-NEXT:    [[TMP6:%.*]] = tail call <vscale x 8 x half> @llvm.vector.extract.nxv8f16.nxv32f16(<vscale x 32 x half> [[ZM]], i64 16)
+// CHECK-NEXT:    [[TMP7:%.*]] = tail call <vscale x 8 x half> @llvm.vector.extract.nxv8f16.nxv32f16(<vscale x 32 x half> [[ZM]], i64 24)
+// CHECK-NEXT:    tail call void @llvm.aarch64.sme.fmla.vg1x4.nxv8f16(i32 [[SLICE]], <vscale x 8 x half> [[TMP0]], <vscale x 8 x half> [[TMP1]], <vscale x 8 x half> [[TMP2]], <vscale x 8 x half> [[TMP3]], <vscale x 8 x half> [[TMP4]], <vscale x 8 x half> [[TMP5]], <vscale x 8 x half> [[TMP6]], <vscale x 8 x half> [[TMP7]])
+// CHECK-NEXT:    ret void
+//
+// CHECK-CXX-LABEL: define dso_local void @_Z25test_svmla_za16_f16_vg1x4j13svfloat16x4_tS_(
+// CHECK-CXX-SAME: i32 noundef [[SLICE:%.*]], <vscale x 32 x half> [[ZN:%.*]], <vscale x 32 x half> [[ZM:%.*]]) local_unnamed_addr #[[ATTR0]] {
+// CHECK-CXX-NEXT:  entry:
+// CHECK-CXX-NEXT:    [[TMP0:%.*]] = tail call <vscale x 8 x half> @llvm.vector.extract.nxv8f16.nxv32f16(<vscale x 32 x half> [[ZN]], i64 0)
+// CHECK-CXX-NEXT:    [[TMP1:%.*]] = tail call <vscale x 8 x half> @llvm.vector.extract.nxv8f16.nxv32f16(<vscale x 32 x half> [[ZN]], i64 8)
+// CHECK-CXX-NEXT:    [[TMP2:%.*]] = tail call <vscale x 8 x half> @llvm.vector.extract.nxv8f16.nxv32f16(<vscale x 32 x half> [[ZN]], i64 16)
+// CHECK-CXX-NEXT:    [[TMP3:%.*]] = tail call <vscale x 8 x half> @llvm.vector.extract.nxv8f16.nxv32f16(<vscale x 32 x half> [[ZN]], i64 24)
+// CHECK-CXX-NEXT:    [[TMP4:%.*]] = tail call <vscale x 8 x half> @llvm.vector.extract.nxv8f16.nxv32f16(<vscale x 32 x half> [[ZM]], i64 0)
+// CHECK-CXX-NEXT:    [[TMP5:%.*]] = tail call <vscale x 8 x half> @llvm.vector.extract.nxv8f16.nxv32f16(<vscale x 32 x half> [[ZM]], i64 8)
+// CHECK-CXX-NEXT:    [[TMP6:%.*]] = tail call <vscale x 8 x half> @llvm.vector.extract.nxv8f16.nxv32f16(<vscale x 32 x half> [[ZM]], i64 16)
+// CHECK-CXX-NEXT:    [[TMP7:%.*]] = tail call <vscale x 8 x half> @llvm.vector.extract.nxv8f16.nxv32f16(<vscale x 32 x half> [[ZM]], i64 24)
+// CHECK-CXX-NEXT:    tail call void @llvm.aarch64.sme.fmla.vg1x4.nxv8f16(i32 [[SLICE]], <vscale x 8 x half> [[TMP0]], <vscale x 8 x half> [[TMP1]], <vscale x 8 x half> [[TMP2]], <vscale x 8 x half> [[TMP3]], <vscale x 8 x half> [[TMP4]], <vscale x 8 x half> [[TMP5]], <vscale x 8 x half> [[TMP6]], <vscale x 8 x half> [[TMP7]])
+// CHECK-CXX-NEXT:    ret void
+//
+void test_svmla_za16_f16_vg1x4(uint32_t slice, svfloat16x4_t zn, svfloat16x4_t zm) __arm_streaming __arm_inout("za") {
+  SME_ACLE_FUNC(svmla,,_za16,_f16,_vg1x4)(slice, zn, zm);
+}
+
+// CHECK-LABEL: define dso_local void @test_svmls_za16_f16_vg1x2(
+// CHECK-SAME: i32 noundef [[SLICE:%.*]], <vscale x 16 x half> [[ZN:%.*]], <vscale x 16 x half> [[ZM:%.*]]) local_unnamed_addr #[[ATTR0]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[TMP0:%.*]] = tail call <vscale x 8 x half> @llvm.vector.extract.nxv8f16.nxv16f16(<vscale x 16 x half> [[ZN]], i64 0)
+// CHECK-NEXT:    [[TMP1:%.*]] = tail call <vscale x 8 x half> @llvm.vector.extract.nxv8f16.nxv16f16(<vscale x 16 x half> [[ZN]], i64 8)
+// CHECK-NEXT:    [[TMP2:%.*]] = tail call <vscale x 8 x half> @llvm.vector.extract.nxv8f16.nxv16f16(<vscale x 16 x half> [[ZM]], i64 0)
+// CHECK-NEXT:    [[TMP3:%.*]] = tail call <vscale x 8 x half> @llvm.vector.extract.nxv8f16.nxv16f16(<vscale x 16 x half> [[ZM]], i64 8)
+// CHECK-NEXT:    tail call void @llvm.aarch64.sme.fmls.vg1x2.nxv8f16(i32 [[SLICE]], <vscale x 8 x half> [[TMP0]], <vscale x 8 x half> [[TMP1]], <vscale x 8 x half> [[TMP2]], <vscale x 8 x half> [[TMP3]])
+// CHECK-NEXT:    ret void
+//
+// CHECK-CXX-LABEL: define dso_local void @_Z25test_svmls_za16_f16_vg1x2j13svfloat16x2_tS_(
+// CHECK-CXX-SAME: i32 noundef [[SLICE:%.*]], <vscale x 16 x half> [[ZN:%.*]], <vscale x 16 x half> [[ZM:%.*]]) local_unnamed_addr #[[ATTR0]] {
+// CHECK-CXX-NEXT:  entry:
+// CHECK-CXX-NEXT:    [[TMP0:%.*]] = tail call <vscale x 8 x half> @llvm.vector.extract.nxv8f16.nxv16f16(<vscale x 16 x half> [[ZN]], i64 0)
+// CHECK-CXX-NEXT:    [[TMP1:%.*]] = tail call <vscale x 8 x half> @llvm.vector.extract.nxv8f16.nxv16f16(<vscale x 16 x half> [[ZN]], i64 8)
+// CHECK-CXX-NEXT:    [[TMP2:%.*]] = tail call <vscale x 8 x half> @llvm.vector.extract.nxv8f16.nxv16f16(<vscale x 16 x half> [[ZM]], i64 0)
+// CHECK-CXX-NEXT:    [[TMP3:%.*]] = tail call <vscale x 8 x half> @llvm.vector.extract.nxv8f16.nxv16f16(<vscale x 16 x half> [[ZM]], i64 8)
+// CHECK-CXX-NEXT:    tail call void @llvm.aarch64.sme.fmls.vg1x2.nxv8f16(i32 [[SLICE]], <vscale x 8 x half> [[TMP0]], <vscale x 8 x half> [[TMP1]], <vscale x 8 x half> [[TMP2]], <vscale x 8 x half> [[TMP3]])
+// CHECK-CXX-NEXT:    ret void
+//
+void test_svmls_za16_f16_vg1x2(uint32_t slice, svfloat16x2_t zn, svfloat16x2_t zm) __arm_streaming __arm_inout("za") {
+  SME_ACLE_FUNC(svmls,,_za16,_f16,_vg1x2)(slice, zn, zm);
+}
+
+// CHECK-LABEL: define dso_local void @test_svmls_za16_f16_vg1x4(
+// CHECK-SAME: i32 noundef [[SLICE:%.*]], <vscale x 32 x half> [[ZN:%.*]], <vscale x 32 x half> [[ZM:%.*]]) local_unnamed_addr #[[ATTR0]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[TMP0:%.*]] = tail call <vscale x 8 x half> @llvm.vector.extract.nxv8f16.nxv32f16(<vscale x 32 x half> [[ZN]], i64 0)
+// CHECK-NEXT:    [[TMP1:%.*]] = tail call <vscale x 8 x half> @llvm.vector.extract.nxv8f16.nxv32f16(<vscale x 32 x half> [[ZN]], i64 8)
+// CHECK-NEXT:    [[TMP2:%.*]] = tail call <vscale x 8 x half> @llvm.vector.extract.nxv8f16.nxv32f16(<vscale x 32 x half> [[ZN]], i64 16)
+// CHECK-NEXT:    [[TMP3:%.*]] = tail call <vscale x 8 x half> @llvm.vector.extract.nxv8f16.nxv32f16(<vscale x 32 x half> [[ZN]], i64 24)
+// CHECK-NEXT:    [[TMP4:%.*]] = tail call <vscale x 8 x half> @llvm.vector.extract.nxv8f16.nxv32f16(<vscale x 32 x half> [[ZM]], i64 0)
+// CHECK-NEXT:    [[TMP5:%.*]] = tail call <vscale x 8 x half> @llvm.vector.extract.nxv8f16.nxv32f16(<vscale x 32 x half> [[ZM]], i64 8)
+// CHECK-NEXT:    [[TMP6:%.*]] = tail call <vscale x 8 x half> @llvm.vector.extract.nxv8f16.nxv32f16(<vscale x 32 x half> [[ZM]], i64 16)
+// CHECK-NEXT:    [[TMP7:%.*]] = tail call <vscale x 8 x half> @llvm.vector.extract.nxv8f16.nxv32f16(<vscale x 32 x half> [[ZM]], i64 24)
+// CHECK-NEXT:    tail call void @llvm.aarch64.sme.fmls.vg1x4.nxv8f16(i32 [[SLICE]], <vscale x 8 x half> [[TMP0]], <vscale x 8 x half> [[TMP1]], <vscale x 8 x half> [[TMP2]], <vscale x 8 x half> [[TMP3]], <vscale x 8 x half> [[TMP4]], <vscale x 8 x half> [[TMP5]], <vscale x 8 x half> [[TMP6]], <vscale x 8 x half> [[TMP7]])
+// CHECK-NEXT:    ret void
+//
+// CHECK-CXX-LABEL: define dso_local void @_Z25test_svmls_za16_f16_vg1x4j13svfloat16x4_tS_(
+// CHECK-CXX-SAME: i32 noundef [[SLICE:%.*]], <vscale x 32 x half> [[ZN:%.*]], <vscale x 32 x half> [[ZM:%.*]]) local_unnamed_addr #[[ATTR0]] {
+// CHECK-CXX-NEXT:  entry:
+// CHECK-CXX-NEXT:    [[TMP0:%.*]] = tail call <vscale x 8 x half> @llvm.vector.extract.nxv8f16.nxv32f16(<vscale x 32 x half> [[ZN]], i64 0)
+// CHECK-CXX-NEXT:    [[TMP1:%.*]] = tail call <vscale x 8 x half> @llvm.vector.extract.nxv8f16.nxv32f16(<vscale x 32 x half> [[ZN]], i64 8)
+// CHECK-CXX-NEXT:    [[TMP2:%.*]] = tail call <vscale x 8 x half> @llvm.vector.extract.nxv8f16.nxv32f16(<vscale x 32 x half> [[ZN]], i64 16)
+// CHECK-CXX-NEXT:    [[TMP3:%.*]] = tail call <vscale x 8 x half> @llvm.vector.extract.nxv8f16.nxv32f16(<vscale x 32 x half> [[ZN]], i64 24)
+// CHECK-CXX-NEXT:    [[TMP4:%.*]] = tail call <vscale x 8 x half> @llvm.vector.extract.nxv8f16.nxv32f16(<vscale x 32 x half> [[ZM]], i64 0)
+// CHECK-CXX-NEXT:    [[TMP5:%.*]] = tail call <vscale x 8 x half> @llvm.vector.extract.nxv8f16.nxv32f16(<vscale x 32 x half> [[ZM]], i64 8)
+// CHECK-CXX-NEXT:    [[TMP6:%.*]] = tail call <vscale x 8 x half> @llvm.vector.extract.nxv8f16.nxv32f16(<vscale x 32 x half> [[ZM]], i64 16)
+// CHECK-CXX-NEXT:    [[TMP7:%.*]] = tail call <vscale x 8 x half> @llvm.vector.extract.nxv8f16.nxv32f16(<vscale x 32 x half> [[ZM]], i64 24)
+// CHECK-CXX-NEXT:    tail call void @llvm.aarch64.sme.fmls.vg1x4.nxv8f16(i32 [[SLICE]], <vscale x 8 x half> [[TMP0]], <vscale x 8 x half> [[TMP1]], <vscale x 8 x half> [[TMP2]], <vscale x 8 x half> [[TMP3]], <vscale x 8 x half> [[TMP4]], <vscale x 8 x half> [[TMP5]], <vscale x 8 x half> [[TMP6]], <vscale x 8 x half> [[TMP7]])
+// CHECK-CXX-NEXT:    ret void
+//
+void test_svmls_za16_f16_vg1x4(uint32_t slice, svfloat16x4_t zn, svfloat16x4_t zm) __arm_streaming __arm_inout("za") {
+  SME_ACLE_FUNC(svmls,,_za16,_f16,_vg1x4)(slice, zn, zm);
+}
+
+// CHECK-LABEL: define dso_local void @test_svmla_lane_za16_f16_vg1x2(
+// CHECK-SAME: i32 noundef [[SLICE:%.*]], <vscale x 16 x half> [[ZN:%.*]], <vscale x 8 x half> [[ZM:%.*]]) local_unnamed_addr #[[ATTR0]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[TMP0:%.*]] = tail call <vscale x 8 x half> @llvm.vector.extract.nxv8f16.nxv16f16(<vscale x 16 x half> [[ZN]], i64 0)
+// CHECK-NEXT:    [[TMP1:%.*]] = tail call <vscale x 8 x half> @llvm.vector.extract.nxv8f16.nxv16f16(<vscale x 16 x half> [[ZN]], i64 8)
+// CHECK-NEXT:    tail call void @llvm.aarch64.sme.fmla.lane.vg1x2.nxv8f16(i32 [[SLICE]], <vscale x 8 x half> [[TMP0]], <vscale x 8 x half> [[TMP1]], <vscale x 8 x half> [[ZM]], i32 7)
+// CHECK-NEXT:    ret void
+//
+// CHECK-CXX-LABEL: define dso_local void @_Z30test_svmla_lane_za16_f16_vg1x2j13svfloat16x2_tu13__SVFloat16_t(
+// CHECK-CXX-SAME: i32 noundef [[SLICE:%.*]], <vscale x 16 x half> [[ZN:%.*]], <vscale x 8 x half> [[ZM:%.*]]) local_unnamed_addr #[[ATTR0]] {
+// CHECK-CXX-NEXT:  entry:
+// CHECK-CXX-NEXT:    [[TMP0:%.*]] = tail call <vscale x 8 x half> @llvm.vector.extract.nxv8f16.nxv16f16(<vscale x 16 x half> [[ZN]], i64 0)
+// CHECK-CXX-NEXT:    [[TMP1:%.*]] = tail call <vscale x 8 x half> @llvm.vector.extract.nxv8f16.nxv16f16(<vscale x 16 x half> [[ZN]], i64 8)
+// CHECK-CXX-NEXT:    tail call void @llvm.aarch64.sme.fmla.lane.vg1x2.nxv8f16(i32 [[SLICE]], <vscale x 8 x half> [[TMP0]], <vscale x 8 x half> [[TMP1]], <vscale x 8 x half> [[ZM]], i32 7)
+// CHECK-CXX-NEXT:    ret void
+//
+void test_svmla_lane_za16_f16_vg1x2(uint32_t slice, svfloat16x2_t zn, svfloat16_t zm) __arm_streaming __arm_inout("za") {
+  SME_ACLE_FUNC(svmla_lane,,_za16,_f16,_vg1x2)(slice, zn, zm, 7);
+}
+
+// CHECK-LABEL: define dso_local void @test_svmla_lane_za16_f16_vg1x4(
+// CHECK-SAME: i32 noundef [[SLICE:%.*]], <vscale x 32 x half> [[ZN:%.*]], <vscale x 8 x half> [[ZM:%.*]]) local_unnamed_addr #[[ATTR0]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[TMP0:%.*]] = tail call <vscale x 8 x half> @llvm.vector.extract.nxv8f16.nxv32f16(<vscale x 32 x half> [[ZN]], i64 0)
+// CHECK-NEXT:    [[TMP1:%.*]] = tail call <vscale x 8 x half> @llvm.vector.extract.nxv8f16.nxv32f16(<vscale x 32 x half> [[ZN]], i64 8)
+// CHECK-NEXT:    [[TMP2:%.*]] = tail call <vscale x 8 x half> @llvm.vector.extract.nxv8f16.nxv32f16(<vscale x 32 x half> [[ZN]], i64 16)
+// CHECK-NEXT:    [[TMP3:%.*]] = tail call <vscale x 8 x half> @llvm.vector.extract.nxv8f16.nxv32f16(<vscale x 32 x half> [[ZN]], i64 24)
+// CHECK-NEXT:    tail call void @llvm.aarch64.sme.fmla.lane.vg1x4.nxv8f16(i32 [[SLICE]], <vscale x 8 x half> [[TMP0]], <vscale x 8 x half> [[TMP1]], <vscale x 8 x half> [[TMP2]], <vscale x 8 x half> [[TMP3]], <vscale x 8 x half> [[ZM]], i32 7)
+// CHECK-NEXT:    ret void
+//
+// CHECK-CXX-LABEL: define dso_local void @_Z30test_svmla_lane_za16_f16_vg1x4j13svfloat16x4_tu13__SVFloat16_t(
+// CHECK-CXX-SAME: i32 noundef [[SLICE:%.*]], <vscale x 32 x half> [[ZN:%.*]], <vscale x 8 x half> [[ZM:%.*]]) local_unnamed_addr #[[ATTR0]] {
+// CHECK-CXX-NEXT:  entry:
+// CHECK-CXX-NEXT:    [[TMP0:%.*]] = tail call <vscale x 8 x half> @llvm.vector.extract.nxv8f16.nxv32f16(<vscale x 32 x half> [[ZN]], i64 0)
+// CHECK-CXX-NEXT:    [[TMP1:%.*]] = tail call <vscale x 8 x half> @llvm.vector.extract.nxv8f16.nxv32f16(<vscale x 32 x half> [[ZN]], i64 8)
+// CHECK-CXX-NEXT:    [[TMP2:%.*]] = tail call <vscale x 8 x half> @llvm.vector.extract.nxv8f16.nxv32f16(<vscale x 32 x half> [[ZN]], i64 16)
+// CHECK-CXX-NEXT:    [[TMP3:%.*]] = tail call <vscale x 8 x half> @llvm.vector.extract.nxv8f16.nxv32f16(<vscale x 32 x half> [[ZN]], i64 24)
+// CHECK-CXX-NEXT:    tail call void @llvm.aarch64.sme.fmla.lane.vg1x4.nxv8f16(i32 [[SLICE]], <vscale x 8 x half> [[TMP0]], <vscale x 8 x half> [[TMP1]], <vscale x 8 x half> [[TMP2]], <vscale x 8 x half> [[TMP3]], <vscale x 8 x half> [[ZM]], i32 7)
+// CHECK-CXX-NEXT:    ret void
+//
+void test_svmla_lane_za16_f16_vg1x4(uint32_t slice, svfloat16x4_t zn, svfloat16_t zm) __arm_streaming __arm_inout("za") {
+  SME_ACLE_FUNC(svmla_lane,,_za16,_f16,_vg1x4)(slice, zn, zm, 7);
+}
+
+// CHECK-LABEL: define dso_local void @test_svmls_lane_za16_f16_vg1x2(
+// CHECK-SAME: i32 noundef [[SLICE:%.*]], <vscale x 16 x half> [[ZN:%.*]], <vscale x 8 x half> [[ZM:%.*]]) local_unnamed_addr #[[ATTR0]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[TMP0:%.*]] = tail call <vscale x 8 x half> @llvm.vector.extract.nxv8f16.nxv16f16(<vscale x 16 x half> [[ZN]], i64 0)
+// CHECK-NEXT:    [[TMP1:%.*]] = tail call <vscale x 8 x half> @llvm.vector.extract.nxv8f16.nxv16f16(<vscale x 16 x half> [[ZN]], i64 8)
+// CHECK-NEXT:    tail call void @llvm.aarch64.sme.fmls.lane.vg1x2.nxv8f16(i32 [[SLICE]], <vscale x 8 x half> [[TMP0]], <vscale x 8 x half> [[TMP1]], <vscale x 8 x half> [[ZM]], i32 7)
+// CHECK-NEXT:    ret void
+//
+// CHECK-CXX-LABEL: define dso_local void @_Z30test_svmls_lane_za16_f16_vg1x2j13svfloat16x2_tu13__SVFloat16_t(
+// CHECK-CXX-SAME: i32 noundef [[SLICE:%.*]], <vscale x 16 x half> [[ZN:%.*]], <vscale x 8 x half> [[ZM:%.*]]) local_unnamed_addr #[[ATTR0]] {
+// CHECK-CXX-NEXT:  entry:
+// CHECK-CXX-NEXT:    [[TMP0:%.*]] = tail call <vscale x 8 x half> @llvm.vector.extract.nxv8f16.nxv16f16(<vscale x 16 x half> [[ZN]], i64 0)
+// CHECK-CXX-NEXT:    [[TMP1:%.*]] = tail call <vscale x 8 x half> @llvm.vector.extract.nxv8f16.nxv16f16(<vscale x 16 x half> [[ZN]], i64 8)
+// CHECK-CXX-NEXT:    tail call void @llvm.aarch64.sme.fmls.lane.vg1x2.nxv8f16(i32 [[SLICE]], <vscale x 8 x half> [[TMP0]], <vscale x 8 x half> [[TMP1]], <vscale x 8 x half> [[ZM]], i32 7)
+// CHECK-CXX-NEXT:    ret void
+//
+void test_svmls_lane_za16_f16_vg1x2(uint32_t slice, svfloat16x2_t zn, svfloat16_t zm) __arm_streaming __arm_inout("za") {
+  SME_ACLE_FUNC(svmls_lane,,_za16,_f16,_vg1x2)(slice, zn, zm, 7);
+}
+
+// CHECK-LABEL: define dso_local void @test_svmls_lane_za16_f16_vg1x4(
+// CHECK-SAME: i32 noundef [[SLICE:%.*]], <vscale x 32 x half> [[ZN:%.*]], <vscale x 8 x half> [[ZM:%.*]]) local_unnamed_addr #[[ATTR0]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[TMP0:%.*]] = tail call <vscale x 8 x half> @llvm.vector.extract.nxv8f16.nxv32f16(<vscale x 32 x half> [[ZN]], i64 0)
+// CHECK-NEXT:    [[TMP1:%.*]] = tail call <vscale x 8 x half> @llvm.vector.extract.nxv8f16.nxv32f16(<vscale x 32 x half> [[ZN]], i64 8)
+// CHECK-NEXT:    [[TMP2:%.*]] = tail call <vscale x 8 x half> @llvm.vector.extract.nxv8f16.nxv32f16(<vscale x 32 x half> [[ZN]], i64 16)
+// CHECK-NEXT:    [[TMP3:%.*]] = tail call <vscale x 8 x half> @llvm.vector.extract.nxv8f16.nxv32f16(<vscale x 32 x half> [[ZN]], i64 24)
+// CHECK-NEXT:    tail call void @llvm.aarch64.sme.fmls.lane.vg1x4.nxv8f16(i32 [[SLICE]], <vscale x 8 x half> [[TMP0]], <vscale x 8 x half> [[TMP1]], <vscale x 8 x half> [[TMP2]], <vscale x 8 x half> [[TMP3]], <vscale x 8 x half> [[ZM]], i32 7)
+// CHECK-NEXT:    ret void
+//
+// CHECK-CXX-LABEL: define dso_local void @_Z30test_svmls_lane_za16_f16_vg1x4j13svfloat16x4_tu13__SVFloat16_t(
+// CHECK-CXX-SAME: i32 noundef [[SLICE:%.*]], <vscale x 32 x half> [[ZN:%.*]], <vscale x 8 x half> [[ZM:%.*]]) local_unnamed_addr #[[ATTR0]] {
+// CHECK-CXX-NEXT:  entry:
+// CHECK-CXX-NEXT:    [[TMP0:%.*]] = tail call <vscale x 8 x half> @llvm.vector.extract.nxv8f16.nxv32f16(<vscale x 32 x half> [[ZN]], i64 0)
+// CHECK-CXX-NEXT:    [[TMP1:%.*]] = tail call <vscale x 8 x half> @llvm.vector.extract.nxv8f16.nxv32f16(<vscale x 32 x half> [[ZN]], i64 8)
+// CHECK-CXX-NEXT:    [[TMP2:%.*]] = tail call <vscale x 8 x half> @llvm.vector.extract.nxv8f16.nxv32f16(<vscale x 32 x half> [[ZN]], i64 16)
+// CHECK-CXX-NEXT:    [[TMP3:%.*]] = tail call <vscale x 8 x half> @llvm.vector.extract.nxv8f16.nxv32f16(<vscale x 32 x half> [[ZN]], i64 24)
+// CHECK-CXX-NEXT:    tail call void @llvm.aarch64.sme.fmls.lane.vg1x4.nxv8f16(i32 [[SLICE]], <vscale x 8 x half> [[TMP0]], <vscale x 8 x half> [[TMP1]], <vscale x 8 x half> [[TMP2]], <vscale x 8 x half> [[TMP3]], <vscale x 8 x half> [[ZM]], i32 7)
+// CHECK-CXX-NEXT:    ret void
+//
+void test_svmls_lane_za16_f16_vg1x4(uint32_t slice, svfloat16x4_t zn, svfloat16_t zm) __arm_streaming __arm_inout("za") {
+  SME_ACLE_FUNC(svmls_lane,,_za16,_f16,_vg1x4)(slice, zn, zm, 7);
+}
+
+// CHECK-LABEL: define dso_local void @test_svmla_single_za16_bf16_vg1x2(
+// CHECK-SAME: i32 noundef [[SLICE:%.*]], <vscale x 16 x bfloat> [[ZN:%.*]], <vscale x 8 x bfloat> [[ZM:%.*]]) local_unnamed_addr #[[ATTR0]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[TMP0:%.*]] = tail call <vscale x 8 x bfloat> @llvm.vector.extract.nxv8bf16.nxv16bf16(<vscale x 16 x bfloat> [[ZN]], i64 0)
+// CHECK-NEXT:    [[TMP1:%.*]] = tail call <vscale x 8 x bfloat> @llvm.vector.extract.nxv8bf16.nxv16bf16(<vscale x 16 x bfloat> [[ZN]], i64 8)
+// CHECK-NEXT:    tail call void @llvm.aarch64.sme.fmla.single.vg1x2.nxv8bf16(i32 [[SLICE]], <vscale x 8 x bfloat> [[TMP0]], <vscale x 8 x bfloat> [[TMP1]], <vscale x 8 x bfloat> [[ZM]])
+// CHECK-NEXT:    ret void
+//
+// CHECK-CXX-LABEL: define dso_local void @_Z33test_svmla_single_za16_bf16_vg1x2j14svbfloat16x2_tu14__SVBfloat16_t(
+// CHECK-CXX-SAME: i32 noundef [[SLICE:%.*]], <vscale x 16 x bfloat> [[ZN:%.*]], <vscale x 8 x bfloat> [[ZM:%.*]]) local_unnamed_addr #[[ATTR0]] {
+// CHECK-CXX-NEXT:  entry:
+// CHECK-CXX-NEXT:    [[TMP0:%.*]] = tail call <vscale x 8 x bfloat> @llvm.vector.extract.nxv8bf16.nxv16bf16(<vscale x 16 x bfloat> [[ZN]], i64 0)
+// CHECK-CXX-NEXT:    [[TMP1:%.*]] = tail call <vscale x 8 x bfloat> @llvm.vector.extract.nxv8bf16.nxv16bf16(<vscale x 16 x bfloat> [[ZN]], i64 8)
+// CHECK-CXX-NEXT:    tail call void @llvm.aarch64.sme.fmla.single.vg1x2.nxv8bf16(i32 [[SLICE]], <vscale x 8 x bfloat> [[TMP0]], <vscale x 8 x bfloat> [[TMP1]], <vscale x 8 x bfloat> [[ZM]])
+// CHECK-CXX-NEXT:    ret void
+//
+void test_svmla_single_za16_bf16_vg1x2(uint32_t slice, svbfloat16x2_t zn, svbfloat16_t zm) __arm_streaming __arm_inout("za") {
+  SME_ACLE_FUNC(svmla, _single, _za16, _bf16, _vg1x2)(slice, zn, zm);
+}
+
+// CHECK-LABEL: define dso_local void @test_svmla_single_za16_bf16_vg1x4(
+// CHECK-SAME: i32 noundef [[SLICE:%.*]], <vscale x 32 x bfloat> [[ZN:%.*]], <vscale x 8 x bfloat> [[ZM:%.*]]) local_unnamed_addr #[[ATTR0]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[TMP0:%.*]] = tail call <vscale x 8 x bfloat> @llvm.vector.extract.nxv8bf16.nxv32bf16(<vscale x 32 x bfloat> [[ZN]], i64 0)
+// CHECK-NEXT:    [[TMP1:%.*]] = tail call <vscale x 8 x bfloat> @llvm.vector.extract.nxv8bf16.nxv32bf16(<vscale x 32 x bfloat> [[ZN]], i64 8)
+// CHECK-NEXT:    [[TMP2:%.*]] = tail call <vscale x 8 x bfloat> @llvm.vector.extract.nxv8bf16.nxv32bf16(<vscale x 32 x bfloat> [[ZN]], i64 16)
+// CHECK-NEXT:    [[TMP3:%.*]] = tail call <vscale x 8 x bfloat> @llvm.vector.extract.nxv8bf16.nxv32bf16(<vscale x 32 x bfloat> [[ZN]], i64 24)
+// CHECK-NEXT:    tail call void @llvm.aarch64.sme.fmla.single.vg1x4.nxv8bf16(i32 [[SLICE]], <vscale x 8 x bfloat> [[TMP0]], <vscale x 8 x bfloat> [[TMP1]], <vscale x 8 x bfloat> [[TMP2]], <vscale x 8 x bfloat> [[TMP3]], <vscale x 8 x bfloat> [[ZM]])
+// CHECK-NEXT:    ret void
+//
+// CHECK-CXX-LABEL: define dso_local void @_Z33test_svmla_single_za16_bf16_vg1x4j14svbfloat16x4_tu14__SVBfloat16_t(
+// CHECK-CXX-SAME: i32 noundef [[SLICE:%.*]], <vscale x 32 x bfloat> [[ZN:%.*]], <vscale x 8 x bfloat> [[ZM:%.*]]) local_unnamed_addr #[[ATTR0]] {
+// CHECK-CXX-NEXT:  entry:
+// CHECK-CXX-NEXT:    [[TMP0:%.*]] = tail call <vscale x 8 x bfloat> @llvm.vector.extract.nxv8bf16.nxv32bf16(<vscale x 32 x bfloat> [[ZN]], i64 0)
+// CHECK-CXX-NEXT:    [[TMP1:%.*]] = tail call <vscale x 8 x bfloat> @llvm.vector.extract.nxv8bf16.nxv32bf16(<vscale x 32 x bfloat> [[ZN]], i64 8)
+// CHECK-CXX-NEXT:    [[TMP2:%.*]] = tail call <vscale x 8 x bfloat> @llvm.vector.extract.nxv8bf16.nxv32bf16(<vscale x 32 x bfloat> [[ZN]], i64 16)
+// CHECK-CXX-NEXT:    [[TMP3:%.*]] = tail call <vscale x 8 x bfloat> @llvm.vector.extract.nxv8bf16.nxv32bf16(<vscale x 32 x bfloat> [[ZN]], i64 24)
+// CHECK-CXX-NEXT:    tail call void @llvm.aarch64.sme.fmla.single.vg1x4.nxv8bf16(i32 [[SLICE]], <vscale x 8 x bfloat> [[TMP0]], <vscale x 8 x bfloat> [[TMP1]], <vscale x 8 x bfloat> [[TMP2]], <vscale x 8 x bfloat> [[TMP3]], <vscale x 8 x bfloat> [[ZM]])
+// CHECK-CXX-NEXT:    ret void
+//
+void test_svmla_single_za16_bf16_vg1x4(uint32_t slice, svbfloat16x4_t zn, svbfloat16_t zm) __arm_streaming __arm_inout("za") {
+  SME_ACLE_FUNC(svmla, _single, _za16, _bf16, _vg1x4)(slice, zn, zm);
+}
+
+// CHECK-LABEL: define dso_local void @test_svmls_single_za16_bf16_vg1x2(
+// CHECK-SAME: i32 noundef [[SLICE:%.*]], <vscale x 16 x bfloat> [[ZN:%.*]], <vscale x 8 x bfloat> [[ZM:%.*]]) local_unnamed_addr #[[ATTR0]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[TMP0:%.*]] = tail call <vscale x 8 x bfloat> @llvm.vector.extract.nxv8bf16.nxv16bf16(<vscale x 16 x bfloat> [[ZN]], i64 0)
+// CHECK-NEXT:    [[TMP1:%.*]] = tail call <vscale x 8 x bfloat> @llvm.vector.extract.nxv8bf16.nxv16bf16(<vscale x 16 x bfloat> [[ZN]], i64 8)
+// CHECK-NEXT:    tail call void @llvm.aarch64.sme.fmls.single.vg1x2.nxv8bf16(i32 [[SLICE]], <vscale x 8 x bfloat> [[TMP0]], <vscale x 8 x bfloat> [[TMP1]], <vscale x 8 x bfloat> [[ZM]])
+// CHECK-NEXT:    ret void
+//
+// CHECK-CXX-LABEL: define dso_local void @_Z33test_svmls_single_za16_bf16_vg1x2j14svbfloat16x2_tu14__SVBfloat16_t(
+// CHECK-CXX-SAME: i32 noundef [[SLICE:%.*]], <vscale x 16 x bfloat> [[ZN:%.*]], <vscale x 8 x bfloat> [[ZM:%.*]]) local_unnamed_addr #[[ATTR0]] {
+// CHECK-CXX-NEXT:  entry:
+// CHECK-CXX-NEXT:    [[TMP0:%.*]] = tail call <vscale x 8 x bfloat> @llvm.vector.extract.nxv8bf16.nxv16bf16(<vscale x 16 x bfloat> [[ZN]], i64 0)
+// CHECK-CXX-NEXT:    [[TMP1:%.*]] = tail call <vscale x 8 x bfloat> @llvm.vector.extract.nxv8bf16.nxv16bf16(<vscale x 16 x bfloat> [[ZN]], i64 8)
+// CHECK-CXX-NEXT:    tail call void @llvm.aarch64.sme.fmls.single.vg1x2.nxv8bf16(i32 [[SLICE]], <vscale x 8 x bfloat> [[TMP0]], <vscale x 8 x bfloat> [[TMP1]], <vscale x 8 x bfloat> [[ZM]])
+// CHECK-CXX-NEXT:    ret void
+//
+void test_svmls_single_za16_bf16_vg1x2(uint32_t slice, svbfloat16x2_t zn, svbfloat16_t zm) __arm_streaming __arm_inout("za") {
+  SME_ACLE_FUNC(svmls, _single, _za16, _bf16, _vg1x2)(slice, zn, zm);
+}
+
+// CHECK-LABEL: define dso_local void @test_svmls_single_za16_bf16_vg1x4(
+// CHECK-SAME: i32 noundef [[SLICE:%.*]], <vscale x 32 x bfloat> [[ZN:%.*]], <vscale x 8 x bfloat> [[ZM:%.*]]) local_unnamed_addr #[[ATTR0]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[TMP0:%.*]] = tail call <vscale x 8 x bfloat> @llvm.vector.extract.nxv8bf16.nxv32bf16(<vscale x 32 x bfloat> [[ZN]], i64 0)
+// CHECK-NEXT:    [[TMP1:%.*]] = tail call <vscale x 8 x bfloat> @llvm.vector.extract.nxv8bf16.nxv32bf16(<vscale x 32 x bfloat> [[ZN]], i64 8)
+// CHECK-NEXT:    [[TMP2:%.*]] = tail call <vscale x 8 x bfloat> @llvm.vector.extract.nxv8bf16.nxv32bf16(<vscale x 32 x bfloat> [[ZN]], i64 16)
+// CHECK-NEXT:    [[TMP3:%.*]] = tail call <vscale x 8 x bfloat> @llvm.vector.extract.nxv8bf16.nxv32bf16(<vscale x 32 x bfloat> [[ZN]], i64 24)
+// CHECK-NEXT:    tail call void @llvm.aarch64.sme.fmls.single.vg1x4.nxv8bf16(i32 [[SLICE]], <vscale x 8 x bfloat> [[TMP0]], <vscale x 8 x bfloat> [[TMP1]], <vscale x 8 x bfloat> [[TMP2]], <vscale x 8 x bfloat> [[TMP3]], <vscale x 8 x bfloat> [[ZM]])
+// CHECK-NEXT:    ret void
+//
+// CHECK-CXX-LABEL: define dso_local void @_Z33test_svmls_single_za16_bf16_vg1x4j14svbfloat16x4_tu14__SVBfloat16_t(
+// CHECK-CXX-SAME: i32 noundef [[SLICE:%.*]], <vscale x 32 x bfloat> [[ZN:%.*]], <vscale x 8 x bfloat> [[ZM:%.*]]) local_unnamed_addr #[[ATTR0]] {
+// CHECK-CXX-NEXT:  entry:
+// CHECK-CXX-NEXT:    [[TMP0:%.*]] = tail call <vscale x 8 x bfloat> @llvm.vector.extract.nxv8bf16.nxv32bf16(<vscale x 32 x bfloat> [[ZN]], i64 0)
+// CHECK-CXX-NEXT:    [[TMP1:%.*]] = tail call <vscale x 8 x bfloat> @llvm.vector.extract.nxv8bf16.nxv32bf16(<vscale x 32 x bfloat> [[ZN]], i64 8)
+// CHECK-CXX-NEXT:    [[TMP2:%.*]] = tail call <vscale x 8 x bfloat> @llvm.vector.extract.nxv8bf16.nxv32bf16(<vscale x 32 x bfloat> [[ZN]], i64 16)
+// CHECK-CXX-NEXT:    [[TMP3:%.*]] = tail call <vscale x 8 x bfloat> @llvm.vector.extract.nxv8bf16.nxv32bf16(<vscale x 32 x bfloat> [[ZN]], i64 24)
+// CHECK-CXX-NEXT:    tail call void @llvm.aarch64.sme.fmls.single.vg1x4.nxv8bf16(i32 [[SLICE]], <vscale x 8 x bfloat> [[TMP0]], <vscale x 8 x bfloat> [[TMP1]], <vscale x 8 x bfloat> [[TMP2]], <vscale x 8 x bfloat> [[TMP3]], <vscale x 8 x bfloat> [[ZM]])
+// CHECK-CXX-NEXT:    ret void
+//
+void test_svmls_single_za16_bf16_vg1x4(uint32_t slice, svbfloat16x4_t zn, svbfloat16_t zm) __arm_streaming __arm_inout("za") {
+  SME_ACLE_FUNC(svmls, _single, _za16, _bf16, _vg1x4)(slice, zn, zm);
+}
+
+// CHECK-LABEL: define dso_local void @test_svmla_za16_bf16_vg1x2(
+// CHECK-SAME: i32 noundef [[SLICE:%.*]], <vscale x 16 x bfloat> [[ZN:%.*]], <vscale x 16 x bfloat> [[ZM:%.*]]) local_unnamed_addr #[[ATTR0]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[TMP0:%.*]] = tail call <vscale x 8 x bfloat> @llvm.vector.extract.nxv8bf16.nxv16bf16(<vscale x 16 x bfloat> [[ZN]], i64 0)
+// CHECK-NEXT:    [[TMP1:%.*]] = tail call <vscale x 8 x bfloat> @llvm.vector.extract.nxv8bf16.nxv16bf16(<vscale x 16 x bfloat> [[ZN]], i64 8)
+// CHECK-NEXT:    [[TMP2:%.*]] = tail call <vscale x 8 x bfloat> @llvm.vector.extract.nxv8bf16.nxv16bf16(<vscale x 16 x bfloat> [[ZM]], i64 0)
+// CHECK-NEXT:    [[TMP3:%.*]] = tail call <vscale x 8 x bfloat> @llvm.vector.extract.nxv8bf16.nxv16bf16(<vscale x 16 x bfloat> [[ZM]], i64 8)
+// CHECK-NEXT:    tail call void @llvm.aarch64.sme.fmla.vg1x2.nxv8bf16(i32 [[SLICE]], <vscale x 8 x bfloat> [[TMP0]], <vscale x 8 x bfloat> [[TMP1]], <vscale x 8 x bfloat> [[TMP2]], <vscale x 8 x bfloat> [[TMP3]])
+// CHECK-NEXT:    ret void
+//
+// CHECK-CXX-LABEL: define dso_local void @_Z26test_svmla_za16_bf16_vg1x2j14svbfloat16x2_tS_(
+// CHECK-CXX-SAME: i32 noundef [[SLICE:%.*]], <vscale x 16 x bfloat> [[ZN:%.*]], <vscale x 16 x bfloat> [[ZM:%.*]]) local_unnamed_addr #[[ATTR0]] {
+// CHECK-CXX-NEXT:  entry:
+// CHECK-CXX-NEXT:    [[TMP0:%.*]] = tail call <vscale x 8 x bfloat> @llvm.vector.extract.nxv8bf16.nxv16bf16(<vscale x 16 x bfloat> [[ZN]], i64 0)
+// CHECK-CXX-NEXT:    [[TMP1:%.*]] = tail call <vscale x 8 x bfloat> @llvm.vector.extract.nxv8bf16.nxv16bf16(<vscale x 16 x bfloat> [[ZN]], i64 8)
+// CHECK-CXX-NEXT:    [[TMP2:%.*]] = tail call <vscale x 8 x bfloat> @llvm.vector.extract.nxv8bf16.nxv16bf16(<vscale x 16 x bfloat> [[ZM]], i64 0)
+// CHECK-CXX-NEXT:    [[TMP3:%.*]] = tail call <vscale x 8 x bfloat> @llvm.vector.extract.nxv8bf16.nxv16bf16(<vscale x 16 x bfloat> [[ZM]], i64 8)
+// CHECK-CXX-NEXT:    tail call void @llvm.aarch64.sme.fmla.vg1x2.nxv8bf16(i32 [[SLICE]], <vscale x 8 x bfloat> [[TMP0]], <vscale x 8 x bfloat> [[TMP1]], <vscale x 8 x bfloat> [[TMP2]], <vscale x 8 x bfloat> [[TMP3]])
+// CHECK-CXX-NEXT:    ret void
+//
+void test_svmla_za16_bf16_vg1x2(uint32_t slice, svbfloat16x2_t zn, svbfloat16x2_t zm) __arm_streaming __arm_inout("za") {
+  SME_ACLE_FUNC(svmla, , _za16, _bf16, _vg1x2)(slice, zn, zm);
+}
+
+// CHECK-LABEL: define dso_local void @test_svmla_za16_bf16_vg1x4(
+// CHECK-SAME: i32 noundef [[SLICE:%.*]], <vscale x 32 x bfloat> [[ZN:%.*]], <vscale x 32 x bfloat> [[ZM:%.*]]) local_unnamed_addr #[[ATTR0]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[TMP0:%.*]] = tail call <vscale x 8 x bfloat> @llvm.vector.extract.nxv8bf16.nxv32bf16(<vscale x 32 x bfloat> [[ZN]], i64 0)
+// CHECK-NEXT:    [[TMP1:%.*]] = tail call <vscale x 8 x bfloat> @llvm.vector.extract.nxv8bf16.nxv32bf16(<vscale x 32 x bfloat> [[ZN]], i64 8)
+// CHECK-NEXT:    [[TMP2:%.*]] = tail call <vscale x 8 x bfloat> @llvm.vector.extract.nxv8bf16.nxv32bf16(<vscale x 32 x bfloat> [[ZN]], i64 16)
+// CHECK-NEXT:    [[TMP3:%.*]] = tail call <vscale x 8 x bfloat> @llvm.vector.extract.nxv8bf16.nxv32bf16(<vscale x 32 x bfloat> [[ZN]], i64 24)
+// CHECK-NEXT:    [[TMP4:%.*]] = tail call <vscale x 8 x bfloat> @llvm.vector.extract.nxv8bf16.nxv32bf16(<vscale x 32 x bfloat> [[ZM]], i64 0)
+// CHECK-NEXT:    [[TMP5:%.*]] = tail call <vscale x 8 x bfloat> @llvm.vector.extract.nxv8bf16.nxv32bf16(<vscale x 32 x bfloat> [[ZM]], i64 8)
+// CHECK-NEXT:    [[TMP6:%.*]] = tail call <vscale x 8 x bfloat> @llvm.vector.extract.nxv8bf16.nxv32bf16(<vscale x 32 x bfloat> [[ZM]], i64 16)
+// CHECK-NEXT:    [[TMP7:%.*]] = tail call <vscale x 8 x bfloat> @llvm.vector.extract.nxv8bf16.nxv32bf16(<vscale x 32 x bfloat> [[ZM]], i64 24)
+// CHECK-NEXT:    tail call void @llvm.aarch64.sme.fmla.vg1x4.nxv8bf16(i32 [[SLICE]], <vscale x 8 x bfloat> [[TMP0]], <vscale x 8 x bfloat> [[TMP1]], <vscale x 8 x bfloat> [[TMP2]], <vscale x 8 x bfloat> [[TMP3]], <vscale x 8 x bfloat> [[TMP4]], <vscale x 8 x bfloat> [[TMP5]], <vscale x 8 x bfloat> [[TMP6]], <vscale x 8 x bfloat> [[TMP7]])
+// CHECK-NEXT:    ret void
+//
+// CHECK-CXX-LABEL: define dso_local void @_Z26test_svmla_za16_bf16_vg1x4j14svbfloat16x4_tS_(
+// CHECK-CXX-SAME: i32 noundef [[SLICE:%.*]], <vscale x 32 x bfloat> [[ZN:%.*]], <vscale x 32 x bfloat> [[ZM:%.*]]) local_unnamed_addr #[[ATTR0]] {
+// CHECK-CXX-NEXT:  entry:
+// CHECK-CXX-NEXT:    [[TMP0:%.*]] = tail call <vscale x 8 x bfloat> @llvm.vector.extract.nxv8bf16.nxv32bf16(<vscale x 32 x bfloat> [[ZN]], i64 0)
+// CHECK-CXX-NEXT:    [[TMP1:%.*]] = tail call <vscale x 8 x bfloat> @llvm.vector.extract.nxv8bf16.nxv32bf16(<vscale x 32 x bfloat> [[ZN]], i64 8)
+// CHECK-CXX-NEXT:    [[TMP2:%.*]] = tail call <vscale x 8 x bfloat> @llvm.vector.extract.nxv8bf16.nxv32bf16(<vscale x 32 x bfloat> [[ZN]], i64 16)
+// CHECK-CXX-NEXT:    [[TMP3:%.*]] = tail call <vscale x 8 x bfloat> @llvm.vector.extract.nxv8bf16.nxv32bf16(<vscale x 32 x bfloat> [[ZN]], i64 24)
+// CHECK-CXX-NEXT:    [[TMP4:%.*]] = tail call <vscale x 8 x bfloat> @llvm.vector.extract.nxv8bf16.nxv32bf16(<vscale x 32 x bfloat> [[ZM]], i64 0)
+// CHECK-CXX-NEXT:    [[TMP5:%.*]] = tail call <vscale x 8 x bfloat> @llvm.vector.extract.nxv8bf16.nxv32bf16(<vscale x 32 x bfloat> [[ZM]], i64 8)
+// CHECK-CXX-NEXT:    [[TMP6:%.*]] = tail call <vscale x 8 x bfloat> @llvm.vector.extract.nxv8bf16.nxv32bf16(<vscale x 32 x bfloat> [[ZM]], i64 16)
+// CHECK-CXX-NEXT:    [[TMP7:%.*]] = tail call <vscale x 8 x bfloat> @llvm.vector.extract.nxv8bf16.nxv32bf16(<vscale x 32 x bfloat> [[ZM]], i64 24)
+// CHECK-CXX-NEXT:    tail call void @llvm.aarch64.sme.fmla.vg1x4.nxv8bf16(i32 [[SLICE]], <vscale x 8 x bfloat> [[TMP0]], <vscale x 8 x bfloat> [[TMP1]], <vscale x 8 x bfloat> [[TMP2]], <vscale x 8 x bfloat> [[TMP3]], <vscale x 8 x bfloat> [[TMP4]], <vscale x 8 x bfloat> [[TMP5]], <vscale x 8 x bfloat> [[TMP6]], <vscale x 8 x bfloat> [[TMP7]])
+// CHECK-CXX-NEXT:    ret void
+//
+void test_svmla_za16_bf16_vg1x4(uint32_t slice, svbfloat16x4_t zn, svbfloat16x4_t zm) __arm_streaming __arm_inout("za") {
+  SME_ACLE_FUNC(svmla, , _za16, _bf16, _vg1x4)(slice, zn, zm);
+}
+
+// CHECK-LABEL: define dso_local void @test_svmls_za16_bf16_vg1x2(
+// CHECK-SAME: i32 noundef [[SLICE:%.*]], <vscale x 16 x bfloat> [[ZN:%.*]], <vscale x 16 x bfloat> [[ZM:%.*]]) local_unnamed_addr #[[ATTR0]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[TMP0:%.*]] = tail call <vscale x 8 x bfloat> @llvm.vector.extract.nxv8bf16.nxv16bf16(<vscale x 16 x bfloat> [[ZN]], i64 0)
+// CHECK-NEXT:    [[TMP1:%.*]] = tail call <vscale x 8 x bfloat> @llvm.vector.extract.nxv8bf16.nxv16bf16(<vscale x 16 x bfloat> [[ZN]], i64 8)
+// CHECK-NEXT:    [[TMP2:%.*]] = tail call <vscale x 8 x bfloat> @llvm.vector.extract.nxv8bf16.nxv16bf16(<vscale x 16 x bfloat> [[ZM]], i64 0)
+// CHECK-NEXT:    [[TMP3:%.*]] = tail call <vscale x 8 x bfloat> @llvm.vector.extract.nxv8bf16.nxv16bf16(<vscale x 16 x bfloat> [[ZM]], i64 8)
+// CHECK-NEXT:    tail call void @llvm.aarch64.sme.fmls.vg1x2.nxv8bf16(i32 [[SLICE]], <vscale x 8 x bfloat> [[TMP0]], <vscale x 8 x bfloat> [[TMP1]], <vscale x 8 x bfloat> [[TMP2]], <vscale x 8 x bfloat> [[TMP3]])
+// CHECK-NEXT:    ret void
+//
+// CHECK-CXX-LABEL: define dso_local void @_Z26test_svmls_za16_bf16_vg1x2j14svbfloat16x2_tS_(
+// CHECK-CXX-SAME: i32 noundef [[SLICE:%.*]], <vscale x 16 x bfloat> [[ZN:%.*]], <vscale x 16 x bfloat> [[ZM:%.*]]) local_unnamed_addr #[[ATTR0]] {
+// CHECK-CXX-NEXT:  entry:
+// CHECK-CXX-NEXT:    [[TMP0:%.*]] = tail call <vscale x 8 x bfloat> @llvm.vector.extract.nxv8bf16.nxv16bf16(<vscale x 16 x bfloat> [[ZN]], i64 0)
+// CHECK-CXX-NEXT:    [[TMP1:%.*]] = tail call <vscale x 8 x bfloat> @llvm.vector.extract.nxv8bf16.nxv16bf16(<vscale x 16 x bfloat> [[ZN]], i64 8)
+// CHECK-CXX-NEXT:    [[TMP2:%.*]] = tail call <vscale x 8 x bfloat> @llvm.vector.extract.nxv8bf16.nxv16bf16(<vscale x 16 x bfloat> [[ZM]], i64 0)
+// CHECK-CXX-NEXT:    [[TMP3:%.*]] = tail call <vscale x 8 x bfloat> @llvm.vector.extract.nxv8bf16.nxv16bf16(<vscale x 16 x bfloat> [[ZM]], i64 8)
+// CHECK-CXX-NEXT:    tail call void @llvm.aarch64.sme.fmls.vg1x2.nxv8bf16(i32 [[SLICE]], <vscale x 8 x bfloat> [[TMP0]], <vscale x 8 x bfloat> [[TMP1]], <vscale x 8 x bfloat> [[TMP2]], <vscale x 8 x bfloat> [[TMP3]])
+// CHECK-CXX-NEXT:    ret void
+//
+void test_svmls_za16_bf16_vg1x2(uint32_t slice, svbfloat16x2_t zn, svbfloat16x2_t zm) __arm_streaming __arm_inout("za") {
+  SME_ACLE_FUNC(svmls, , _za16, _bf16, _vg1x2)(slice, zn, zm);
+}
+
+// CHECK-LABEL: define dso_local void @test_svmls_za16_bf16_vg1x4(
+// CHECK-SAME: i32 noundef [[SLICE:%.*]], <vscale x 32 x bfloat> [[ZN:%.*]], <vscale x 32 x bfloat> [[ZM:%.*]]) local_unnamed_addr #[[ATTR0]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[TMP0:%.*]] = tail call <vscale x 8 x bfloat> @llvm.vector.extract.nxv8bf16.nxv32bf16(<vscale x 32 x bfloat> [[ZN]], i64 0)
+// CHECK-NEXT:    [[TMP1:%.*]] = tail call <vscale x 8 x bfloat> @llvm.vector.extract.nxv8bf16.nxv32bf16(<vscale x 32 x bfloat> [[ZN]], i64 8)
+// CHECK-NEXT:    [[TMP2:%.*]] = tail call <vscale x 8 x bfloat> @llvm.vector.extract.nxv8bf16.nxv32bf16(<vscale x 32 x bfloat> [[ZN]], i64 16)
+// CHECK-NEXT:    [[TMP3:%.*]] = tail call <vscale x 8 x bfloat> @llvm.vector.extract.nxv8bf16.nxv32bf16(<vscale x 32 x bfloat> [[ZN]], i64 24)
+// CHECK-NEXT:    [[TMP4:%.*]] = tail call <vscale x 8 x bfloat> @llvm.vector.extract.nxv8bf16.nxv32bf16(<vscale x 32 x bfloat> [[ZM]], i64 0)
+// CHECK-NEXT:    [[TMP5:%.*]] = tail call <vscale x 8 x bfloat> @llvm.vector.extract.nxv8bf16.nxv32bf16(<vscale x 32 x bfloat> [[ZM]], i64 8)
+// CHECK-NEXT:    [[TMP6:%.*]] = tail call <vscale x 8 x bfloat> @llvm.vector.extract.nxv8bf16.nxv32bf16(<vscale x 32 x bfloat> [[ZM]], i64 16)
+// CHECK-NEXT:    [[TMP7:%.*]] = tail call <vscale x 8 x bfloat> @llvm.vector.extract.nxv8bf16.nxv32bf16(<vscale x 32 x bfloat> [[ZM]], i64 24)
+// CHECK-NEXT:    tail call void @llvm.aarch64.sme.fmls.vg1x4.nxv8bf16(i32 [[SLICE]], <vscale x 8 x bfloat> [[TMP0]], <vscale x 8 x bfloat> [[TMP1]], <vscale x 8 x bfloat> [[TMP2]], <vscale x 8 x bfloat> [[TMP3]], <vscale x 8 x bfloat> [[TMP4]], <vscale x 8 x bfloat> [[TMP5]], <vscale x 8 x bfloat> [[TMP6]], <vscale x 8 x bfloat> [[TMP7]])
+// CHECK-NEXT:    ret void
+//
+// CHECK-CXX-LABEL: define dso_local void @_Z26test_svmls_za16_bf16_vg1x4j14svbfloat16x4_tS_(
+// CHECK-CXX-SAME: i32 noundef [[SLICE:%.*]], <vscale x 32 x bfloat> [[ZN:%.*]], <vscale x 32 x bfloat> [[ZM:%.*]]) local_unnamed_addr #[[ATTR0]] {
+// CHECK-CXX-NEXT:  entry:
+// CHECK-CXX-NEXT:    [[TMP0:%.*]] = tail call <vscale x 8 x bfloat> @llvm.vector.extract.nxv8bf16.nxv32bf16(<vscale x 32 x bfloat> [[ZN]], i64 0)
+// CHECK-CXX-NEXT:    [[TMP1:%.*]] = tail call <vscale x 8 x bfloat> @llvm.vector.extract.nxv8bf16.nxv32bf16(<vscale x 32 x bfloat> [[ZN]], i64 8)
+// CHECK-CXX-NEXT:    [[TMP2:%.*]] = tail call <vscale x 8 x bfloat> @llvm.vector.extract.nxv8bf16.nxv32bf16(<vscale x 32 x bfloat> [[ZN]], i64 16)
+// CHECK-CXX-NEXT:    [[TMP3:%.*]] = tail call <vscale x 8 x bfloat> @llvm.vector.extract.nxv8bf16.nxv32bf16(<vscale x 32 x bfloat> [[ZN]], i64 24)
+// CHECK-CXX-NEXT:    [[TMP4:%.*]] = tail call <vscale x 8 x bfloat> @llvm.vector.extract.nxv8bf16.nxv32bf16(<vscale x 32 x bfloat> [[ZM]], i64 0)
+// CHECK-CXX-NEXT:    [[TMP5:%.*]] = tail call <vscale x 8 x bfloat> @llvm.vector.extract.nxv8bf16.nxv32bf16(<vscale x 32 x bfloat> [[ZM]], i64 8)
+// CHECK-CXX-NEXT:    [[TMP6:%.*]] = tail call <vscale x 8 x bfloat> @llvm.vector.extract.nxv8bf16.nxv32bf16(<vscale x 32 x bfloat> [[ZM]], i64 16)
+// CHECK-CXX-NEXT:    [[TMP7:%.*]] = tail call <vscale x 8 x bfloat> @llvm.vector.extract.nxv8bf16.nxv32bf16(<vscale x 32 x bfloat> [[ZM]], i64 24)
+// CHECK-CXX-NEXT:    tail call void @llvm.aarch64.sme.fmls.vg1x4.nxv8bf16(i32 [[SLICE]], <vscale x 8 x bfloat> [[TMP0]], <vscale x 8 x bfloat> [[TMP1]], <vscale x 8 x bfloat> [[TMP2]], <vscale x 8 x bfloat> [[TMP3]], <vscale x 8 x bfloat> [[TMP4]], <vscale x 8 x bfloat> [[TMP5]], <vscale x 8 x bfloat> [[TMP6]], <vscale x 8 x bfloat> [[TMP7]])
+// CHECK-CXX-NEXT:    ret void
+//
+void test_svmls_za16_bf16_vg1x4(uint32_t slice, svbfloat16x4_t zn, svbfloat16x4_t zm) __arm_streaming __arm_inout("za") {
+  SME_ACLE_FUNC(svmls, , _za16, _bf16, _vg1x4)(slice, zn, zm);
+}
+
+// CHECK-LABEL: define dso_local void @test_svmla_lane_za16_bf16_vg1x2(
+// CHECK-SAME: i32 noundef [[SLICE:%.*]], <vscale x 16 x bfloat> [[ZN:%.*]], <vscale x 8 x bfloat> [[ZM:%.*]]) local_unnamed_addr #[[ATTR0]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[TMP0:%.*]] = tail call <vscale x 8 x bfloat> @llvm.vector.extract.nxv8bf16.nxv16bf16(<vscale x 16 x bfloat> [[ZN]], i64 0)
+// CHECK-NEXT:    [[TMP1:%.*]] = tail call <vscale x 8 x bfloat> @llvm.vector.extract.nxv8bf16.nxv16bf16(<vscale x 16 x bfloat> [[ZN]], i64 8)
+// CHECK-NEXT:    tail call void @llvm.aarch64.sme.fmla.lane.vg1x2.nxv8bf16(i32 [[SLICE]], <vscale x 8 x bfloat> [[TMP0]], <vscale x 8 x bfloat> [[TMP1]], <vscale x 8 x bfloat> [[ZM]], i32 7)
+// CHECK-NEXT:    ret void
+//
+// CHECK-CXX-LABEL: define dso_local void @_Z31test_svmla_lane_za16_bf16_vg1x2j14svbfloat16x2_tu14__SVBfloat16_t(
+// CHECK-CXX-SAME: i32 noundef [[SLICE:%.*]], <vscale x 16 x bfloat> [[ZN:%.*]], <vscale x 8 x bfloat> [[ZM:%.*]]) local_unnamed_addr #[[ATTR0]] {
+// CHECK-CXX-NEXT:  entry:
+// CHECK-CXX-NEXT:    [[TMP0:%.*]] = tail call <vscale x 8 x bfloat> @llvm.vector.extract.nxv8bf16.nxv16bf16(<vscale x 16 x bfloat> [[ZN]], i64 0)
+// CHECK-CXX-NEXT:    [[TMP1:%.*]] = tail call <vscale x 8 x bfloat> @llvm.vector.extract.nxv8bf16.nxv16bf16(<vscale x 16 x bfloat> [[ZN]], i64 8)
+// CHECK-CXX-NEXT:    tail call void @llvm.aarch64.sme.fmla.lane.vg1x2.nxv8bf16(i32 [[SLICE]], <vscale x 8 x bfloat> [[TMP0]], <vscale x 8 x bfloat> [[TMP1]], <vscale x 8 x bfloat> [[ZM]], i32 7)
+// CHECK-CXX-NEXT:    ret void
+//
+void test_svmla_lane_za16_bf16_vg1x2(uint32_t slice, svbfloat16x2_t zn, svbfloat16_t zm) __arm_streaming __arm_inout("za") {
+  SME_ACLE_FUNC(svmla_lane, , _za16, _bf16, _vg1x2)(slice, zn, zm, 7);
+}
+
+// CHECK-LABEL: define dso_local void @test_svmla_lane_za16_bf16_vg1x4(
+// CHECK-SAME: i32 noundef [[SLICE:%.*]], <vscale x 32 x bfloat> [[ZN:%.*]], <vscale x 8 x bfloat> [[ZM:%.*]]) local_unnamed_addr #[[ATTR0]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[TMP0:%.*]] = tail call <vscale x 8 x bfloat> @llvm.vector.extract.nxv8bf16.nxv32bf16(<vscale x 32 x bfloat> [[ZN]], i64 0)
+// CHECK-NEXT:    [[TMP1:%.*]] = tail call <vscale x 8 x bfloat> @llvm.vector.extract.nxv8bf16.nxv32bf16(<vscale x 32 x bfloat> [[ZN]], i64 8)
+// CHECK-NEXT:    [[TMP2:%.*]] = tail call <vscale x 8 x bfloat> @llvm.vector.extract.nxv8bf16.nxv32bf16(<vscale x 32 x bfloat> [[ZN]], i64 16)
+// CHECK-NEXT:    [[TMP3:%.*]] = tail call <vscale x 8 x bfloat> @llvm.vector.extract.nxv8bf16.nxv32bf16(<vscale x 32 x bfloat> [[ZN]], i64 24)
+// CHECK-NEXT:    tail call void @llvm.aarch64.sme.fmla.lane.vg1x4.nxv8bf16(i32 [[SLICE]], <vscale x 8 x bfloat> [[TMP0]], <vscale x 8 x bfloat> [[TMP1]], <vscale x 8 x bfloat> [[TMP2]], <vscale x 8 x bfloat> [[TMP3]], <vscale x 8 x bfloat> [[ZM]], i32 7)
+// CHECK-NEXT:    ret void
+//
+// CHECK-CXX-LABEL: define dso_local void @_Z31test_svmla_lane_za16_bf16_vg1x4j14svbfloat16x4_tu14__SVBfloat16_t(
+// CHECK-CXX-SAME: i32 noundef [[SLICE:%.*]], <vscale x 32 x bfloat> [[ZN:%.*]], <vscale x 8 x bfloat> [[ZM:%.*]]) local_unnamed_addr #[[ATTR0]] {
+// CHECK-CXX-NEXT:  entry:
+// CHECK-CXX-NEXT:    [[TMP0:%.*]] = tail call <vscale x 8 x bfloat> @llvm.vector.extract.nxv8bf16.nxv32bf16(<vscale x 32 x bfloat> [[ZN]], i64 0)
+// CHECK-CXX-NEXT:    [[TMP1:%.*]] = tail call <vscale x 8 x bfloat> @llvm.vector.extract.nxv8bf16.nxv32bf16(<vscale x 32 x bfloat> [[ZN]], i64 8)
+// CHECK-CXX-NEXT:    [[TMP2:%.*]] = tail call <vscale x 8 x bfloat> @llvm.vector.extract.nxv8bf16.nxv32bf16(<vscale x 32 x bfloat> [[ZN]], i64 16)
+// CHECK-CXX-NEXT:    [[TMP3:%.*]] = tail call <vscale x 8 x bfloat> @llvm.vector.extract.nxv8bf16.nxv32bf16(<vscale x 32 x bfloat> [[ZN]], i64 24)
+// CHECK-CXX-NEXT:    tail call void @llvm.aarch64.sme.fmla.lane.vg1x4.nxv8bf16(i32 [[SLICE]], <vscale x 8 x bfloat> [[TMP0]], <vscale x 8 x bfloat> [[TMP1]], <vscale x 8 x bfloat> [[TMP2]], <vscale x 8 x bfloat> [[TMP3]], <vscale x 8 x bfloat> [[ZM]], i32 7)
+// CHECK-CXX-NEXT:    ret void
+//
+void test_svmla_lane_za16_bf16_vg1x4(uint32_t slice, svbfloat16x4_t zn, svbfloat16_t zm) __arm_streaming __arm_inout("za") {
+  SME_ACLE_FUNC(svmla_lane, , _za16, _bf16, _vg1x4)(slice, zn, zm, 7);
+}
+
+// CHECK-LABEL: define dso_local void @test_svmls_lane_za16_bf16_vg1x2(
+// CHECK-SAME: i32 noundef [[SLICE:%.*]], <vscale x 16 x bfloat> [[ZN:%.*]], <vscale x 8 x bfloat> [[ZM:%.*]]) local_unnamed_addr #[[ATTR0]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[TMP0:%.*]] = tail call <vscale x 8 x bfloat> @llvm.vector.extract.nxv8bf16.nxv16bf16(<vscale x 16 x bfloat> [[ZN]], i64 0)
+// CHECK-NEXT:    [[TMP1:%.*]] = tail call <vscale x 8 x bfloat> @llvm.vector.extract.nxv8bf16.nxv16bf16(<vscale x 16 x bfloat> [[ZN]], i64 8)
+// CHECK-NEXT:    tail call void @llvm.aarch64.sme.fmls.lane.vg1x2.nxv8bf16(i32 [[SLICE]], <vscale x 8 x bfloat> [[TMP0]], <vscale x 8 x bfloat> [[TMP1]], <vscale x 8 x bfloat> [[ZM]], i32 7)
+// CHECK-NEXT:    ret void
+//
+// CHECK-CXX-LABEL: define dso_local void @_Z31test_svmls_lane_za16_bf16_vg1x2j14svbfloat16x2_tu14__SVBfloat16_t(
+// CHECK-CXX-SAME: i32 noundef [[SLICE:%.*]], <vscale x 16 x bfloat> [[ZN:%.*]], <vscale x 8 x bfloat> [[ZM:%.*]]) local_unnamed_addr #[[ATTR0]] {
+// CHECK-CXX-NEXT:  entry:
+// CHECK-CXX-NEXT:    [[TMP0:%.*]] = tail call <vscale x 8 x bfloat> @llvm.vector.extract.nxv8bf16.nxv16bf16(<vscale x 16 x bfloat> [[ZN]], i64 0)
+// CHECK-CXX-NEXT:    [[TMP1:%.*]] = tail call <vscale x 8 x bfloat> @llvm.vector.extract.nxv8bf16.nxv16bf16(<vscale x 16 x bfloat> [[ZN]], i64 8)
+// CHECK-CXX-NEXT:    tail call void @llvm.aarch64.sme.fmls.lane.vg1x2.nxv8bf16(i32 [[SLICE]], <vscale x 8 x bfloat> [[TMP0]], <vscale x 8 x bfloat> [[TMP1]], <vscale x 8 x bfloat> [[ZM]], i32 7)
+// CHECK-CXX-NEXT:    ret void
+//
+void test_svmls_lane_za16_bf16_vg1x2(uint32_t slice, svbfloat16x2_t zn, svbfloat16_t zm) __arm_streaming __arm_inout("za") {
+  SME_ACLE_FUNC(svmls_lane, , _za16, _bf16, _vg1x2)(slice, zn, zm, 7);
+}
+
+// CHECK-LABEL: define dso_local void @test_svmls_lane_za16_bf16_vg1x4(
+// CHECK-SAME: i32 noundef [[SLICE:%.*]], <vscale x 32 x bfloat> [[ZN:%.*]], <vscale x 8 x bfloat> [[ZM:%.*]]) local_unnamed_addr #[[ATTR0]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[TMP0:%.*]] = tail call <vscale x 8 x bfloat> @llvm.vector.extract.nxv8bf16.nxv32bf16(<vscale x 32 x bfloat> [[ZN]], i64 0)
+// CHECK-NEXT:    [[TMP1:%.*]] = tail call <vscale x 8 x bfloat> @llvm.vector.extract.nxv8bf16.nxv32bf16(<vscale x 32 x bfloat> [[ZN]], i64 8)
+// CHECK-NEXT:    [[TMP2:%.*]] = tail call <vscale x 8 x bfloat> @llvm.vector.extract.nxv8bf16.nxv32bf16(<vscale x 32 x bfloat> [[ZN]], i64 16)
+// CHECK-NEXT:    [[TMP3:%.*]] = tail call <vscale x 8 x bfloat> @llvm.vector.extract.nxv8bf16.nxv32bf16(<vscale x 32 x bfloat> [[ZN]], i64 24)
+// CHECK-NEXT:    tail call void @llvm.aarch64.sme.fmls.lane.vg1x4.nxv8bf16(i32 [[SLICE]], <vscale x 8 x bfloat> [[TMP0]], <vscale x 8 x bfloat> [[TMP1]], <vscale x 8 x bfloat> [[TMP2]], <vscale x 8 x bfloat> [[TMP3]], <vscale x 8 x bfloat> [[ZM]], i32 7)
+// CHECK-NEXT:    ret void
+//
+// CHECK-CXX-LABEL: define dso_local void @_Z31test_svmls_lane_za16_bf16_vg1x4j14svbfloat16x4_tu14__SVBfloat16_t(
+// CHECK-CXX-SAME: i32 noundef [[SLICE:%.*]], <vscale x 32 x bfloat> [[ZN:%.*]], <vscale x 8 x bfloat> [[ZM:%.*]]) local_unnamed_addr #[[ATTR0]] {
+// CHECK-CXX-NEXT:  entry:
+// CHECK-CXX-NEXT:    [[TMP0:%.*]] = tail call <vscale x 8 x bfloat> @llvm.vector.extract.nxv8bf16.nxv32bf16(<vscale x 32 x bfloat> [[ZN]], i64 0)
+// CHECK-CXX-NEXT:    [[TMP1:%.*]] = tail call <vscale x 8 x bfloat> @llvm.vector.extract.nxv8bf16.nxv32bf16(<vscale x 32 x bfloat> [[ZN]], i64 8)
+// CHECK-CXX-NEXT:    [[TMP2:%.*]] = tail call <vscale x 8 x bfloat> @llvm.vector.extract.nxv8bf16.nxv32bf16(<vscale x 32 x bfloat> [[ZN]], i64 16)
+// CHECK-CXX-NEXT:    [[TMP3:%.*]] = tail call <vscale x 8 x bfloat> @llvm.vector.extract.nxv8bf16.nxv32bf16(<vscale x 32 x bfloat> [[ZN]], i64 24)
+// CHECK-CXX-NEXT:    tail call void @llvm.aarch64.sme.fmls.lane.vg1x4.nxv8bf16(i32 [[SLICE]], <vscale x 8 x bfloat> [[TMP0]], <vscale x 8 x bfloat> [[TMP1]], <vscale x 8 x bfloat> [[TMP2]], <vscale x 8 x bfloat> [[TMP3]], <vscale x 8 x bfloat> [[ZM]], i32 7)
+// CHECK-CXX-NEXT:    ret void
+//
+void test_svmls_lane_za16_bf16_vg1x4(uint32_t slice, svbfloat16x4_t zn, svbfloat16_t zm) __arm_streaming __arm_inout("za") {
+  SME_ACLE_FUNC(svmls_lane, , _za16, _bf16, _vg1x4)(slice, zn, zm, 7);
+}
diff --git a/clang/test/Sema/aarch64-sme2-intrinsics/acle_sme2_fmlas16.c b/clang/test/Sema/aarch64-sme2-intrinsics/acle_sme2_fmlas16.c
new file mode 100644
index 00000000000000..c6cf8a518fa792
--- /dev/null
+++ b/clang/test/Sema/aarch64-sme2-intrinsics/acle_sme2_fmlas16.c
@@ -0,0 +1,90 @@
+// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme -verify -S -emit-llvm %s
+
+// REQUIRES: aarch64-registered-target
+
+#include <arm_sme.h>
+
+
+void test_features_f16f16(uint32_t slice,
+                          svfloat16_t zm,
+                          svfloat16x2_t zn2, svfloat16x2_t zm2,
+                          svfloat16x4_t zn4, svfloat16x4_t zm4,
+                          svbfloat16_t bzm,
+                          svbfloat16x2_t bzn2, svbfloat16x2_t bzm2,
+                          svbfloat16x4_t bzn4, svbfloat16x4_t bzm4)
+  __arm_streaming __arm_inout("za") {
+  // expected-error@+1 {{'svmla_single_za16_f16_vg1x2' needs target feature sme2p1,sme-f16f16}}
+  svmla_single_za16_f16_vg1x2(slice, zn2, zm);
+  // expected-error@+1 {{'svmla_single_za16_f16_vg1x4' needs target feature sme2p1,sme-f16f16}}
+  svmla_single_za16_f16_vg1x4(slice, zn4, zm);
+  // expected-error@+1 {{'svmls_single_za16_f16_vg1x2' needs target feature sme2p1,sme-f16f16}}
+  svmls_single_za16_f16_vg1x2(slice, zn2, zm);
+  // expected-error@+1 {{'svmls_single_za16_f16_vg1x4' needs target feature sme2p1,sme-f16f16}}
+  svmls_single_za16_f16_vg1x4(slice, zn4, zm);
+  // expected-error@+1 {{'svmla_za16_f16_vg1x2' needs target feature sme2p1,sme-f16f16}}
+  svmla_za16_f16_vg1x2(slice, zn2, zm2);
+  // expected-error@+1 {{'svmla_za16_f16_vg1x4' needs target feature sme2p1,sme-f16f16}}
+  svmla_za16_f16_vg1x4(slice, zn4, zm4);
+  // expected-error@+1 {{'svmls_za16_f16_vg1x2' needs target feature sme2p1,sme-f16f16}}
+  svmls_za16_f16_vg1x2(slice, zn2, zm2);
+  // expected-error@+1 {{'svmls_za16_f16_vg1x4' needs target feature sme2p1,sme-f16f16}}
+  svmls_za16_f16_vg1x4(slice, zn4, zm4);
+  // expected-error@+1 {{'svmla_lane_za16_f16_vg1x2' needs target feature sme2p1,sme-f16f16}}
+  svmla_lane_za16_f16_vg1x2(slice, zn2, zm, 7);
+  // expected-error@+1 {{'svmla_lane_za16_f16_vg1x4' needs target feature sme2p1,sme-f16f16}}
+  svmla_lane_za16_f16_vg1x4(slice, zn4, zm, 7);
+  // expected-error@+1 {{'svmls_lane_za16_f16_vg1x2' needs target feature sme2p1,sme-f16f16}}
+  svmls_lane_za16_f16_vg1x2(slice, zn2, zm, 7);
+  // expected-error@+1 {{'svmls_lane_za16_f16_vg1x4' needs target feature sme2p1,sme-f16f16}}
+  svmls_lane_za16_f16_vg1x4(slice, zn4, zm, 7);
+
+  // expected-error@+1 {{'svmla_single_za16_bf16_vg1x2' needs target feature sme2,b16b16}}
+  svmla_single_za16_bf16_vg1x2(slice, bzn2, bzm);
+  // expected-error@+1 {{'svmla_single_za16_bf16_vg1x4' needs target feature sme2,b16b16}}
+  svmla_single_za16_bf16_vg1x4(slice, bzn4, bzm);
+  // expected-error@+1 {{'svmls_single_za16_bf16_vg1x2' needs target feature sme2,b16b16}}
+  svmls_single_za16_bf16_vg1x2(slice, bzn2, bzm);
+  // expected-error@+1 {{'svmls_single_za16_bf16_vg1x4' needs target feature sme2,b16b16}}
+  svmls_single_za16_bf16_vg1x4(slice, bzn4, bzm);
+  // expected-error@+1 {{'svmla_za16_bf16_vg1x2' needs target feature sme2,b16b16}}
+  svmla_za16_bf16_vg1x2(slice, bzn2, bzm2);
+  // expected-error@+1 {{'svmla_za16_bf16_vg1x4' needs target feature sme2,b16b16}}
+  svmla_za16_bf16_vg1x4(slice, bzn4, bzm4);
+  // expected-error@+1 {{'svmls_za16_bf16_vg1x2' needs target feature sme2,b16b16}}
+  svmls_za16_bf16_vg1x2(slice, bzn2, bzm2);
+  // expected-error@+1 {{'svmls_za16_bf16_vg1x4' needs target feature sme2,b16b16}}
+  svmls_za16_bf16_vg1x4(slice, bzn4, bzm4);
+  // expected-error@+1 {{'svmla_lane_za16_bf16_vg1x2' needs target feature sme2,b16b16}}
+  svmla_lane_za16_bf16_vg1x2(slice, bzn2, bzm, 7);
+  // expected-error@+1 {{'svmla_lane_za16_bf16_vg1x4' needs target feature sme2,b16b16}}
+  svmla_lane_za16_bf16_vg1x4(slice, bzn4, bzm, 7);
+  // expected-error@+1 {{'svmls_lane_za16_bf16_vg1x2' needs target feature sme2,b16b16}}
+  svmls_lane_za16_bf16_vg1x2(slice, bzn2, bzm, 7);
+  // expected-error@+1 {{'svmls_lane_za16_bf16_vg1x4' needs target feature sme2,b16b16}}
+  svmls_lane_za16_bf16_vg1x4(slice, bzn4, bzm, 7);
+}
+
+
+void test_imm(uint32_t slice, svfloat16_t zm, svfloat16x2_t zn2, svfloat16x4_t zn4,
+              svbfloat16_t bzm, svbfloat16x2_t bzn2, svbfloat16x4_t bzn4)
+  __arm_streaming __arm_inout("za") {
+
+  // expected-error@+1 {{argument value 18446744073709551615 is outside the valid range [0, 7]}}
+  svmla_lane_za16_f16_vg1x2(slice, zn2, zm, -1);
+  // expected-error@+1 {{argument value 18446744073709551615 is outside the valid range [0, 7]}}
+  svmla_lane_za16_f16_vg1x4(slice, zn4, zm, -1);
+  // expected-error@+1 {{argument value 18446744073709551615 is outside the valid range [0, 7]}}
+  svmls_lane_za16_f16_vg1x2(slice, zn2, zm, -1);
+  // expected-error@+1 {{argument value 18446744073709551615 is outside the valid range [0, 7]}}
+  svmls_lane_za16_f16_vg1x4(slice, zn4, zm, -1);
+
+  // expected-error@+1 {{argument value 18446744073709551615 is outside the valid range [0, 7]}}
+  svmla_lane_za16_bf16_vg1x2(slice, bzn2, bzm, -1);
+  // expected-error@+1 {{argument value 18446744073709551615 is outside the valid range [0, 7]}}
+  svmla_lane_za16_bf16_vg1x4(slice, bzn4, bzm, -1);
+  // expected-error@+1 {{argument value 18446744073709551615 is outside the valid range [0, 7]}}
+  svmls_lane_za16_bf16_vg1x2(slice, bzn2, bzm, -1);
+  // expected-error@+1 {{argument value 18446744073709551615 is outside the valid range [0, 7]}}
+  svmls_lane_za16_bf16_vg1x4(slice, bzn4, bzm, -1);
+}
\ No newline at end of file
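
For reference, a minimal positive-path sketch of the new ACLE intrinsics, assuming a build with the sme2p1 and sme-f16f16 target features enabled; the helper name use_za16_f16 is illustrative only and not part of the patch:

// Usage sketch (not part of the patch). Assumes the sme2p1 and sme-f16f16
// target features are enabled, e.g. via the corresponding -target-feature
// flags; without them the calls are diagnosed as in the Sema test above.
#include <arm_sme.h>

void use_za16_f16(uint32_t slice, svfloat16_t zm, svfloat16x2_t zn2)
    __arm_streaming __arm_inout("za") {
  // Multiply-accumulate into two ZA.H single-vector groups selected by 'slice'.
  svmla_single_za16_f16_vg1x2(slice, zn2, zm);   // single-vector multiplicand
  svmla_lane_za16_f16_vg1x2(slice, zn2, zm, 7);  // lane-indexed; lane must be in [0, 7]
}
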
diff --git a/llvm/lib/Target/AArch64/AArch64SMEInstrInfo.td b/llvm/lib/Target/AArch64/AArch64SMEInstrInfo.td
index 2db0fa25343450..fb33cf614b5956 100644
--- a/llvm/lib/Target/AArch64/AArch64SMEInstrInfo.td
+++ b/llvm/lib/Target/AArch64/AArch64SMEInstrInfo.td
@@ -798,19 +798,19 @@ defm FADD_VG4_M4Z_H : sme2_multivec_accum_add_sub_vg4<"fadd", 0b0100, MatrixOp16
 defm FSUB_VG2_M2Z_H : sme2_multivec_accum_add_sub_vg2<"fsub", 0b0101, MatrixOp16, ZZ_h_mul_r, nxv8f16, null_frag>;
 defm FSUB_VG4_M4Z_H : sme2_multivec_accum_add_sub_vg4<"fsub", 0b0101, MatrixOp16, ZZZZ_h_mul_r, nxv8f16, null_frag>;
 
-defm FMLA_VG2_M2ZZI_H : sme2p1_multi_vec_array_vg2_index_16b<"fmla", 0b00, 0b100, ZZ_h_mul_r, ZPR4b16>;
-defm FMLA_VG4_M4ZZI_H : sme2p1_multi_vec_array_vg4_index_16b<"fmla", 0b000, ZZZZ_h_mul_r, ZPR4b16>;
-defm FMLA_VG2_M2ZZ_H :  sme2_dot_mla_add_sub_array_vg24_single<"fmla", 0b0011100, MatrixOp16, ZZ_h, ZPR4b16>;
-defm FMLA_VG4_M4ZZ_H :  sme2_dot_mla_add_sub_array_vg24_single<"fmla", 0b0111100, MatrixOp16, ZZZZ_h, ZPR4b16>;
-defm FMLA_VG2_M2Z4Z_H : sme2_dot_mla_add_sub_array_vg2_multi<"fmla", 0b0100001, MatrixOp16, ZZ_h_mul_r, nxv8f16, null_frag>;
-defm FMLA_VG4_M4Z4Z_H : sme2_dot_mla_add_sub_array_vg4_multi<"fmla", 0b0100001, MatrixOp16, ZZZZ_h_mul_r, nxv8f16, null_frag>;
-
-defm FMLS_VG2_M2ZZI_H : sme2p1_multi_vec_array_vg2_index_16b<"fmls", 0b00, 0b101, ZZ_h_mul_r, ZPR4b16>;
-defm FMLS_VG4_M4ZZI_H : sme2p1_multi_vec_array_vg4_index_16b<"fmls", 0b001, ZZZZ_h_mul_r, ZPR4b16>;
-defm FMLS_VG2_M2ZZ_H :  sme2_dot_mla_add_sub_array_vg24_single<"fmls", 0b0011101, MatrixOp16, ZZ_h, ZPR4b16>;
-defm FMLS_VG4_M4ZZ_H :  sme2_dot_mla_add_sub_array_vg24_single<"fmls", 0b0111101, MatrixOp16, ZZZZ_h, ZPR4b16>;
-defm FMLS_VG2_M2Z2Z_H : sme2_dot_mla_add_sub_array_vg2_multi<"fmls", 0b0100011, MatrixOp16, ZZ_h_mul_r, nxv8f16, null_frag>;
-defm FMLS_VG4_M4Z2Z_H : sme2_dot_mla_add_sub_array_vg4_multi<"fmls", 0b0100011, MatrixOp16, ZZZZ_h_mul_r, nxv8f16, null_frag>;
+defm FMLA_VG2_M2ZZI_H : sme2p1_multi_vec_array_vg2_index_16b<"fmla", 0b00, 0b100, ZZ_h_mul_r, ZPR4b16, nxv8f16, int_aarch64_sme_fmla_lane_vg1x2>;
+defm FMLA_VG4_M4ZZI_H : sme2p1_multi_vec_array_vg4_index_16b<"fmla", 0b000, ZZZZ_h_mul_r, ZPR4b16, nxv8f16, int_aarch64_sme_fmla_lane_vg1x4>;
+defm FMLA_VG2_M2ZZ_H :  sme2_dot_mla_add_sub_array_vg2_single<"fmla", 0b0011100, MatrixOp16, ZZ_h, ZPR4b16, nxv8f16, int_aarch64_sme_fmla_single_vg1x2>;
+defm FMLA_VG4_M4ZZ_H :  sme2_dot_mla_add_sub_array_vg4_single<"fmla", 0b0111100, MatrixOp16, ZZZZ_h, ZPR4b16, nxv8f16, int_aarch64_sme_fmla_single_vg1x4>;
+defm FMLA_VG2_M2Z4Z_H : sme2_dot_mla_add_sub_array_vg2_multi<"fmla", 0b0100001, MatrixOp16, ZZ_h_mul_r, nxv8f16, int_aarch64_sme_fmla_vg1x2>;
+defm FMLA_VG4_M4Z4Z_H : sme2_dot_mla_add_sub_array_vg4_multi<"fmla", 0b0100001, MatrixOp16, ZZZZ_h_mul_r, nxv8f16, int_aarch64_sme_fmla_vg1x4>;
+
+defm FMLS_VG2_M2ZZI_H : sme2p1_multi_vec_array_vg2_index_16b<"fmls", 0b00, 0b101, ZZ_h_mul_r, ZPR4b16, nxv8f16, int_aarch64_sme_fmls_lane_vg1x2>;
+defm FMLS_VG4_M4ZZI_H : sme2p1_multi_vec_array_vg4_index_16b<"fmls", 0b001, ZZZZ_h_mul_r, ZPR4b16, nxv8f16, int_aarch64_sme_fmls_lane_vg1x4>;
+defm FMLS_VG2_M2ZZ_H :  sme2_dot_mla_add_sub_array_vg2_single<"fmls", 0b0011101, MatrixOp16, ZZ_h, ZPR4b16, nxv8f16, int_aarch64_sme_fmls_single_vg1x2>;
+defm FMLS_VG4_M4ZZ_H :  sme2_dot_mla_add_sub_array_vg4_single<"fmls", 0b0111101, MatrixOp16, ZZZZ_h, ZPR4b16, nxv8f16, int_aarch64_sme_fmls_single_vg1x4>;
+defm FMLS_VG2_M2Z2Z_H : sme2_dot_mla_add_sub_array_vg2_multi<"fmls", 0b0100011, MatrixOp16, ZZ_h_mul_r, nxv8f16, int_aarch64_sme_fmls_vg1x2>;
+defm FMLS_VG4_M4Z2Z_H : sme2_dot_mla_add_sub_array_vg4_multi<"fmls", 0b0100011, MatrixOp16, ZZZZ_h_mul_r, nxv8f16, int_aarch64_sme_fmls_vg1x4>;
 
 defm FCVT_2ZZ_H  : sme2p1_fp_cvt_vector_vg2_single<"fcvt", 0b0>;
 defm FCVTL_2ZZ_H : sme2p1_fp_cvt_vector_vg2_single<"fcvtl", 0b1>;
@@ -825,20 +825,19 @@ defm BFADD_VG4_M4Z_H : sme2_multivec_accum_add_sub_vg4<"bfadd", 0b1100, MatrixOp
 defm BFSUB_VG2_M2Z_H : sme2_multivec_accum_add_sub_vg2<"bfsub", 0b1101, MatrixOp16, ZZ_h_mul_r,  nxv8bf16, null_frag>;
 defm BFSUB_VG4_M4Z_H : sme2_multivec_accum_add_sub_vg4<"bfsub", 0b1101, MatrixOp16, ZZZZ_h_mul_r,  nxv8bf16, null_frag>;
 
-defm BFMLA_VG2_M2ZZI : sme2p1_multi_vec_array_vg2_index_16b<"bfmla", 0b00, 0b110, ZZ_h_mul_r, ZPR4b16>;
-defm BFMLA_VG4_M4ZZI : sme2p1_multi_vec_array_vg4_index_16b<"bfmla", 0b010, ZZZZ_h_mul_r, ZPR4b16>;
-defm BFMLA_VG2_M2ZZ : sme2_dot_mla_add_sub_array_vg24_single<"bfmla", 0b1011100, MatrixOp16, ZZ_h, ZPR4b16>;
-defm BFMLA_VG4_M4ZZ : sme2_dot_mla_add_sub_array_vg24_single<"bfmla", 0b1111100, MatrixOp16, ZZZZ_h, ZPR4b16>;
-defm BFMLA_VG2_M2Z2Z : sme2_dot_mla_add_sub_array_vg2_multi<"bfmla", 0b1100001, MatrixOp16, ZZ_h_mul_r, nxv8bf16, null_frag>;
-defm BFMLA_VG4_M4Z4Z : sme2_dot_mla_add_sub_array_vg4_multi<"bfmla", 0b1100001, MatrixOp16, ZZZZ_h_mul_r, nxv8bf16, null_frag>;
-
-defm BFMLS_VG2_M2ZZI : sme2p1_multi_vec_array_vg2_index_16b<"bfmls", 0b00, 0b111, ZZ_h_mul_r, ZPR4b16>;
-defm BFMLS_VG4_M4ZZI : sme2p1_multi_vec_array_vg4_index_16b<"bfmls", 0b011, ZZZZ_h_mul_r, ZPR4b16>;
-defm BFMLS_VG2_M2ZZ : sme2_dot_mla_add_sub_array_vg24_single<"bfmls", 0b1011101, MatrixOp16, ZZ_h, ZPR4b16>;
-defm BFMLS_VG4_M4ZZ : sme2_dot_mla_add_sub_array_vg24_single<"bfmls", 0b1111101, MatrixOp16, ZZZZ_h, ZPR4b16>;
-defm BFMLS_VG2_M2Z2Z : sme2_dot_mla_add_sub_array_vg2_multi<"bfmls", 0b1100011, MatrixOp16, ZZ_h_mul_r, nxv8bf16, null_frag>;
-defm BFMLS_VG4_M4Z4Z : sme2_dot_mla_add_sub_array_vg4_multi<"bfmls", 0b1100011, MatrixOp16, ZZZZ_h_mul_r, nxv8bf16, null_frag>;
+defm BFMLA_VG2_M2ZZI : sme2p1_multi_vec_array_vg2_index_16b<"bfmla", 0b00, 0b110, ZZ_h_mul_r, ZPR4b16, nxv8bf16, int_aarch64_sme_fmla_lane_vg1x2>;
+defm BFMLA_VG4_M4ZZI : sme2p1_multi_vec_array_vg4_index_16b<"bfmla", 0b010, ZZZZ_h_mul_r, ZPR4b16, nxv8bf16, int_aarch64_sme_fmla_lane_vg1x4>;
+defm BFMLA_VG2_M2ZZ  : sme2_dot_mla_add_sub_array_vg2_single<"bfmla", 0b1011100, MatrixOp16, ZZ_h, ZPR4b16, nxv8bf16, int_aarch64_sme_fmla_single_vg1x2>;
+defm BFMLA_VG4_M4ZZ  : sme2_dot_mla_add_sub_array_vg4_single<"bfmla", 0b1111100, MatrixOp16, ZZZZ_h, ZPR4b16, nxv8bf16, int_aarch64_sme_fmla_single_vg1x4>;
+defm BFMLA_VG2_M2Z2Z : sme2_dot_mla_add_sub_array_vg2_multi<"bfmla", 0b1100001, MatrixOp16, ZZ_h_mul_r, nxv8bf16, int_aarch64_sme_fmla_vg1x2>;
+defm BFMLA_VG4_M4Z4Z : sme2_dot_mla_add_sub_array_vg4_multi<"bfmla", 0b1100001, MatrixOp16, ZZZZ_h_mul_r, nxv8bf16, int_aarch64_sme_fmla_vg1x4>;
 
+defm BFMLS_VG2_M2ZZI : sme2p1_multi_vec_array_vg2_index_16b<"bfmls", 0b00, 0b111, ZZ_h_mul_r, ZPR4b16, nxv8bf16, int_aarch64_sme_fmls_lane_vg1x2>;
+defm BFMLS_VG4_M4ZZI : sme2p1_multi_vec_array_vg4_index_16b<"bfmls", 0b011, ZZZZ_h_mul_r, ZPR4b16, nxv8bf16, int_aarch64_sme_fmls_lane_vg1x4>;
+defm BFMLS_VG2_M2ZZ  : sme2_dot_mla_add_sub_array_vg2_single<"bfmls", 0b1011101, MatrixOp16, ZZ_h, ZPR4b16, nxv8bf16, int_aarch64_sme_fmls_single_vg1x2>;
+defm BFMLS_VG4_M4ZZ  : sme2_dot_mla_add_sub_array_vg4_single<"bfmls", 0b1111101, MatrixOp16, ZZZZ_h, ZPR4b16, nxv8bf16, int_aarch64_sme_fmls_single_vg1x4>;
+defm BFMLS_VG2_M2Z2Z : sme2_dot_mla_add_sub_array_vg2_multi<"bfmls", 0b1100011, MatrixOp16, ZZ_h_mul_r, nxv8bf16, int_aarch64_sme_fmls_vg1x2>;
+defm BFMLS_VG4_M4Z4Z : sme2_dot_mla_add_sub_array_vg4_multi<"bfmls", 0b1100011, MatrixOp16, ZZZZ_h_mul_r, nxv8bf16, int_aarch64_sme_fmls_vg1x4>;
 
 defm BFMAX_VG2_2ZZ  : sme2p1_bf_max_min_vector_vg2_single<"bfmax", 0b0010000>;
 defm BFMAX_VG4_4ZZ  : sme2p1_bf_max_min_vector_vg4_single<"bfmax", 0b0010000>;
@@ -907,9 +906,9 @@ def LUTI4_S_4ZZT2Z  : sme2_luti4_vector_vg4_strided<0b00, 0b00, "luti4">;
 } //[HasSME2p1, HasSME_LUTv2]
 
 let Predicates = [HasSMEF8F16] in {
-defm FVDOT_VG2_M2ZZI_BtoH : sme2p1_multi_vec_array_vg2_index_16b<"fvdot", 0b11, 0b110, ZZ_b_mul_r, ZPR4b8>;
-defm FDOT_VG2_M2ZZI_BtoH  : sme2p1_multi_vec_array_vg2_index_16b<"fdot",    0b11, 0b010, ZZ_b_mul_r, ZPR4b8>;
-defm FDOT_VG4_M4ZZI_BtoH  : sme2p1_multi_vec_array_vg4_index_16b<"fdot",    0b100, ZZZZ_b_mul_r, ZPR4b8>;
+defm FVDOT_VG2_M2ZZI_BtoH : sme2p1_multi_vec_array_vg2_index_f8f16<"fvdot", 0b11, 0b110, ZZ_b_mul_r, ZPR4b8>;
+defm FDOT_VG2_M2ZZI_BtoH  : sme2p1_multi_vec_array_vg2_index_f8f16<"fdot",  0b11, 0b010, ZZ_b_mul_r, ZPR4b8>;
+defm FDOT_VG4_M4ZZI_BtoH  : sme2p1_multi_vec_array_vg4_index_f8f16<"fdot",    0b100, ZZZZ_b_mul_r, ZPR4b8>;
 defm FDOT_VG2_M2ZZ_BtoH   :  sme2_dot_mla_add_sub_array_vg24_single<"fdot", 0b0010001, MatrixOp16, ZZ_b, ZPR4b8>;
 defm FDOT_VG4_M4ZZ_BtoH   :  sme2_dot_mla_add_sub_array_vg24_single<"fdot", 0b0110001, MatrixOp16, ZZZZ_b, ZPR4b8>;
 // TODO: Replace nxv16i8 by nxv16f8
diff --git a/llvm/lib/Target/AArch64/SMEInstrFormats.td b/llvm/lib/Target/AArch64/SMEInstrFormats.td
index 44d9a8ac7cb677..7164362a76852a 100644
--- a/llvm/lib/Target/AArch64/SMEInstrFormats.td
+++ b/llvm/lib/Target/AArch64/SMEInstrFormats.td
@@ -2461,9 +2461,29 @@ multiclass sme2_multi_vec_array_vg2_index_32b<string mnemonic, bits<2> sz, bits<
 }
 
 // SME2.1 multi-vec ternary indexed two registers 16-bit
-// SME2 multi-vec indexed FP8 two-way dot product to FP16 two registers
 multiclass sme2p1_multi_vec_array_vg2_index_16b<string mnemonic, bits<2> sz, bits<3> op,
-                                                RegisterOperand multi_vector_ty, ZPRRegOp zpr_ty> {
+                                                RegisterOperand multi_vector_ty, ZPRRegOp vector_ty,
+                                                ValueType vt, SDPatternOperator intrinsic> {
+  def NAME : sme2_multi_vec_array_vg2_index<sz, {op{2},?,?,op{1-0},?}, MatrixOp16,
+                                            multi_vector_ty, vector_ty,
+                                            VectorIndexH, mnemonic>, SMEPseudo2Instr<NAME, 1> {
+    bits<3> i;
+    let Inst{11-10} = i{2-1};
+    let Inst{3}     = i{0};
+  }
+
+  def _PSEUDO : sme2_za_array_2op_multi_index_pseudo<NAME, sme_elm_idx0_7, multi_vector_ty, vector_ty, VectorIndexH32b_timm, SMEMatrixArray>;
+
+  def : SME2_ZA_TwoOp_VG2_Multi_Index_Pat<NAME, intrinsic, sme_elm_idx0_7, vector_ty, vt, VectorIndexH32b_timm, tileslice16>;
+
+  def : InstAlias<mnemonic # "\t$ZAda[$Rv, $imm3], $Zn, $Zm$i",
+        (!cast<Instruction>(NAME) MatrixOp16:$ZAda,  MatrixIndexGPR32Op8_11:$Rv, sme_elm_idx0_7:$imm3,
+        multi_vector_ty:$Zn, vector_ty:$Zm, VectorIndexH:$i), 0>;
+}
+
+// SME2 multi-vec indexed FP8 two-way dot product to FP16 two registers
+multiclass sme2p1_multi_vec_array_vg2_index_f8f16<string mnemonic, bits<2> sz, bits<3> op,
+                                                  RegisterOperand multi_vector_ty, ZPRRegOp zpr_ty> {
   def NAME : sme2_multi_vec_array_vg2_index<sz, {op{2},?,?,op{1-0},?}, MatrixOp16,
                                             multi_vector_ty, zpr_ty,
                                             VectorIndexH, mnemonic> {
@@ -2582,10 +2602,10 @@ multiclass sme2_multi_vec_array_vg4_index_32b<string mnemonic, bits<4> op,
         multi_vector_ty:$Zn, vector_ty:$Zm, VectorIndexS32b_timm:$i), 0>;
 }
 
-// SME2.1 multi-vec ternary indexed four registers 16-bit
-multiclass sme2p1_multi_vec_array_vg4_index_16b<string mnemonic, bits<3> op,
-                                                RegisterOperand multi_vector_ty,
-                                                ZPRRegOp zpr_ty> {
+// SME2.1 multi-vec ternary indexed four registers 16-bit (FP8)
+multiclass sme2p1_multi_vec_array_vg4_index_f8f16<string mnemonic, bits<3> op,
+                                                  RegisterOperand multi_vector_ty,
+                                                  ZPRRegOp zpr_ty> {
   def NAME : sme2_multi_vec_array_vg4_index<0b0,{0b1,?,?,op,?}, MatrixOp16,
                                             multi_vector_ty, zpr_ty,
                                             VectorIndexH, mnemonic>{
@@ -2599,6 +2619,28 @@ multiclass sme2p1_multi_vec_array_vg4_index_16b<string mnemonic, bits<3> op,
         sme_elm_idx0_7:$imm3, multi_vector_ty:$Zn, zpr_ty:$Zm, VectorIndexH:$i), 0>;
 }
 
+// SME2.1 multi-vec ternary indexed four registers 16-bit
+multiclass sme2p1_multi_vec_array_vg4_index_16b<string mnemonic, bits<3> op,
+                                                RegisterOperand multi_vector_ty,
+                                                ZPRRegOp vector_ty, ValueType vt,
+                                                SDPatternOperator intrinsic> {
+  def NAME : sme2_multi_vec_array_vg4_index<0b0,{0b1,?,?,op,?}, MatrixOp16,
+                                            multi_vector_ty, vector_ty,
+                                            VectorIndexH, mnemonic>, SMEPseudo2Instr<NAME, 1> {
+    bits<3> i;
+    let Inst{11-10} = i{2-1};
+    let Inst{3}     = i{0};
+  }
+
+  def _PSEUDO : sme2_za_array_2op_multi_index_pseudo<NAME, sme_elm_idx0_7, multi_vector_ty, vector_ty, VectorIndexH32b_timm, SMEMatrixArray>;
+
+  def : SME2_ZA_TwoOp_VG4_Multi_Index_Pat<NAME, intrinsic, sme_elm_idx0_7, vector_ty, vt, VectorIndexH32b_timm, tileslice16>;
+
+  def : InstAlias<mnemonic # "\t$ZAda[$Rv, $imm3], $Zn, $Zm$i",
+        (!cast<Instruction>(NAME) MatrixOp16:$ZAda,  MatrixIndexGPR32Op8_11:$Rv,
+        sme_elm_idx0_7:$imm3, multi_vector_ty:$Zn, vector_ty:$Zm, VectorIndexH:$i), 0>;
+}
+
 // SME2 multi-vec ternary indexed four registers 64-bit
 class sme2_multi_vec_array_vg4_index_64b<bits<3> op,
                                          RegisterOperand multi_vector_ty,
diff --git a/llvm/test/CodeGen/AArch64/sme2-intrinsics-fmlas16.ll b/llvm/test/CodeGen/AArch64/sme2-intrinsics-fmlas16.ll
new file mode 100644
index 00000000000000..3e807b7e633844
--- /dev/null
+++ b/llvm/test/CodeGen/AArch64/sme2-intrinsics-fmlas16.ll
@@ -0,0 +1,462 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --filter-out "// kill:.*$" --version 4
+; RUN: llc -verify-machineinstrs < %s | FileCheck %s
+
+target triple = "aarch64-linux"
+
+define void @test_fmla_f16_vg2_single(i32 %slice, <vscale x 8 x half> %a0, <vscale x 8 x half> %a1, <vscale x 8 x half> %b) #0 {
+; CHECK-LABEL: test_fmla_f16_vg2_single:
+; CHECK:  // %bb.0:
+; CHECK:    mov w8, w0
+; CHECK:    fmla za.h[w8, 0, vgx2], { z0.h, z1.h }, z2.h
+; CHECK:    fmla za.h[w8, 7, vgx2], { z0.h, z1.h }, z2.h
+; CHECK:    ret
+  call void @llvm.aarch64.sme.fmla.single.vg1x2.nxv8f16(i32 %slice, <vscale x 8 x half> %a0, <vscale x 8 x half> %a1, <vscale x 8 x half> %b)
+  %slice.7 = add i32 %slice, 7
+  call void @llvm.aarch64.sme.fmla.single.vg1x2.nxv8f16(i32 %slice.7, <vscale x 8 x half> %a0, <vscale x 8 x half> %a1, <vscale x 8 x half> %b)
+  ret void
+}
+
+define void @test_fmla_f16_vg4_single(i32 %slice, <vscale x 8 x half> %a0, <vscale x 8 x half> %a1,
+; CHECK-LABEL: test_fmla_f16_vg4_single:
+; CHECK:  // %bb.0:
+; CHECK:    mov w8, w0
+; CHECK:    fmla za.h[w8, 0, vgx4], { z0.h - z3.h }, z4.h
+; CHECK:    fmla za.h[w8, 7, vgx4], { z0.h - z3.h }, z4.h
+; CHECK:    ret
+                                      <vscale x 8 x half> %a2, <vscale x 8 x half> %a3, <vscale x 8 x half> %b) #0 {
+  call void @llvm.aarch64.sme.fmla.single.vg1x4.nxv8f16(i32 %slice, <vscale x 8 x half> %a0, <vscale x 8 x half> %a1,
+                                                        <vscale x 8 x half> %a2, <vscale x 8 x half> %a3, <vscale x 8 x half> %b)
+  %slice.7 = add i32 %slice, 7
+  call void @llvm.aarch64.sme.fmla.single.vg1x4.nxv8f16(i32 %slice.7, <vscale x 8 x half> %a0, <vscale x 8 x half> %a1,
+                                                        <vscale x 8 x half> %a2, <vscale x 8 x half> %a3, <vscale x 8 x half> %b)
+  ret void
+}
+
+define void @test_fmls_f16_vg2_single(i32 %slice, <vscale x 8 x half> %a0, <vscale x 8 x half> %a1, <vscale x 8 x half> %b) #0 {
+; CHECK-LABEL: test_fmls_f16_vg2_single:
+; CHECK:  // %bb.0:
+; CHECK:    mov w8, w0
+; CHECK:    fmls za.h[w8, 0, vgx2], { z0.h, z1.h }, z2.h
+; CHECK:    fmls za.h[w8, 7, vgx2], { z0.h, z1.h }, z2.h
+; CHECK:    ret
+  call void @llvm.aarch64.sme.fmls.single.vg1x2.nxv8f16(i32 %slice, <vscale x 8 x half> %a0, <vscale x 8 x half> %a1, <vscale x 8 x half> %b)
+  %slice.7 = add i32 %slice, 7
+  call void @llvm.aarch64.sme.fmls.single.vg1x2.nxv8f16(i32 %slice.7, <vscale x 8 x half> %a0, <vscale x 8 x half> %a1, <vscale x 8 x half> %b)
+  ret void
+}
+
+define void @test_fmls_f16_vg4_single(i32 %slice, <vscale x 8 x half> %a0, <vscale x 8 x half> %a1,
+; CHECK-LABEL: test_fmls_f16_vg4_single:
+; CHECK:  // %bb.0:
+; CHECK:    mov w8, w0
+; CHECK:    fmls za.h[w8, 0, vgx4], { z0.h - z3.h }, z4.h
+; CHECK:    fmls za.h[w8, 7, vgx4], { z0.h - z3.h }, z4.h
+; CHECK:    ret
+                                      <vscale x 8 x half> %a2, <vscale x 8 x half> %a3, <vscale x 8 x half> %b) #0 {
+  call void @llvm.aarch64.sme.fmls.single.vg1x4.nxv8f16(i32 %slice, <vscale x 8 x half> %a0, <vscale x 8 x half> %a1,
+                                                        <vscale x 8 x half> %a2, <vscale x 8 x half> %a3, <vscale x 8 x half> %b)
+  %slice.7 = add i32 %slice, 7
+  call void @llvm.aarch64.sme.fmls.single.vg1x4.nxv8f16(i32 %slice.7, <vscale x 8 x half> %a0, <vscale x 8 x half> %a1,
+                                                        <vscale x 8 x half> %a2, <vscale x 8 x half> %a3, <vscale x 8 x half> %b)
+  ret void
+}
+
+define void @test_fmla_f16_vg2_multi(i32 %slice,
+; CHECK-LABEL: test_fmla_f16_vg2_multi:
+; CHECK:  // %bb.0:
+; CHECK:    mov w8, w0
+; CHECK:    fmla za.h[w8, 0, vgx2], { z0.h, z1.h }, { z2.h, z3.h }
+; CHECK:    fmla za.h[w8, 7, vgx2], { z0.h, z1.h }, { z2.h, z3.h }
+; CHECK:    ret
+                                     <vscale x 8 x half> %a0, <vscale x 8 x half> %a1,
+                                     <vscale x 8 x half> %b0, <vscale x 8 x half> %b1) #0 {
+  call void @llvm.aarch64.sme.fmla.vg1x2.nxv8f16(i32 %slice,
+                                                <vscale x 8 x half> %a0, <vscale x 8 x half> %a1,
+                                                <vscale x 8 x half> %b0, <vscale x 8 x half> %b1)
+  %slice.7 = add i32 %slice, 7
+  call void @llvm.aarch64.sme.fmla.vg1x2.nxv8f16(i32 %slice.7,
+                                                 <vscale x 8 x half> %a0, <vscale x 8 x half> %a1,
+                                                 <vscale x 8 x half> %b0, <vscale x 8 x half> %b1)
+  ret void
+}
+
+define void @test_fmla_f16_vg4_multi(i32 %slice,
+; CHECK-LABEL: test_fmla_f16_vg4_multi:
+; CHECK:  // %bb.0:
+; CHECK:    mov w8, w0
+; CHECK:    fmla za.h[w8, 0, vgx4], { z0.h - z3.h }, { z4.h - z7.h }
+; CHECK:    fmla za.h[w8, 7, vgx4], { z0.h - z3.h }, { z4.h - z7.h }
+; CHECK:    ret
+                                     <vscale x 8 x half> %a0, <vscale x 8 x half> %a1,
+                                     <vscale x 8 x half> %a2, <vscale x 8 x half> %a3,
+                                     <vscale x 8 x half> %b0, <vscale x 8 x half> %b1,
+                                     <vscale x 8 x half> %b2, <vscale x 8 x half> %b3) #0 {
+  call void @llvm.aarch64.sme.fmla.vg1x4.nxv8f16(i32 %slice,
+                                                 <vscale x 8 x half> %a0, <vscale x 8 x half> %a1,
+                                                 <vscale x 8 x half> %a2, <vscale x 8 x half> %a3,
+                                                 <vscale x 8 x half> %b0, <vscale x 8 x half> %b1,
+                                                 <vscale x 8 x half> %b2, <vscale x 8 x half> %b3)
+  %slice.7 = add i32 %slice, 7
+  call void @llvm.aarch64.sme.fmla.vg1x4.nxv8f16(i32 %slice.7,
+                                                 <vscale x 8 x half> %a0, <vscale x 8 x half> %a1,
+                                                 <vscale x 8 x half> %a2, <vscale x 8 x half> %a3,
+                                                 <vscale x 8 x half> %b0, <vscale x 8 x half> %b1,
+                                                 <vscale x 8 x half> %b2, <vscale x 8 x half> %b3)
+  ret void
+}
+
+define void @test_fmls_f16_vg2_multi(i32 %slice,
+; CHECK-LABEL: test_fmls_f16_vg2_multi:
+; CHECK:  // %bb.0:
+; CHECK:    mov w8, w0
+; CHECK:    fmls za.h[w8, 0, vgx2], { z0.h, z1.h }, { z2.h, z3.h }
+; CHECK:    fmls za.h[w8, 7, vgx2], { z0.h, z1.h }, { z2.h, z3.h }
+; CHECK:    ret
+                                     <vscale x 8 x half> %a0, <vscale x 8 x half> %a1,
+                                     <vscale x 8 x half> %b0, <vscale x 8 x half> %b1) #0 {
+  call void @llvm.aarch64.sme.fmls.vg1x2.nxv8f16(i32 %slice,
+                                                <vscale x 8 x half> %a0, <vscale x 8 x half> %a1,
+                                                <vscale x 8 x half> %b0, <vscale x 8 x half> %b1)
+  %slice.7 = add i32 %slice, 7
+  call void @llvm.aarch64.sme.fmls.vg1x2.nxv8f16(i32 %slice.7,
+                                                 <vscale x 8 x half> %a0, <vscale x 8 x half> %a1,
+                                                 <vscale x 8 x half> %b0, <vscale x 8 x half> %b1)
+  ret void
+}
+
+define void @test_fmls_f16_vg4_multi(i32 %slice,
+; CHECK-LABEL: test_fmls_f16_vg4_multi:
+; CHECK:  // %bb.0:
+; CHECK:    mov w8, w0
+; CHECK:    fmls za.h[w8, 0, vgx4], { z0.h - z3.h }, { z4.h - z7.h }
+; CHECK:    fmls za.h[w8, 7, vgx4], { z0.h - z3.h }, { z4.h - z7.h }
+; CHECK:    ret
+                                     <vscale x 8 x half> %a0, <vscale x 8 x half> %a1,
+                                     <vscale x 8 x half> %a2, <vscale x 8 x half> %a3,
+                                     <vscale x 8 x half> %b0, <vscale x 8 x half> %b1,
+                                     <vscale x 8 x half> %b2, <vscale x 8 x half> %b3) #0 {
+  call void @llvm.aarch64.sme.fmls.vg1x4.nxv8f16(i32 %slice,
+                                                 <vscale x 8 x half> %a0, <vscale x 8 x half> %a1,
+                                                 <vscale x 8 x half> %a2, <vscale x 8 x half> %a3,
+                                                 <vscale x 8 x half> %b0, <vscale x 8 x half> %b1,
+                                                 <vscale x 8 x half> %b2, <vscale x 8 x half> %b3)
+  %slice.7 = add i32 %slice, 7
+  call void @llvm.aarch64.sme.fmls.vg1x4.nxv8f16(i32 %slice.7,
+                                                 <vscale x 8 x half> %a0, <vscale x 8 x half> %a1,
+                                                 <vscale x 8 x half> %a2, <vscale x 8 x half> %a3,
+                                                 <vscale x 8 x half> %b0, <vscale x 8 x half> %b1,
+                                                 <vscale x 8 x half> %b2, <vscale x 8 x half> %b3)
+  ret void
+}
+
+define void @test_fmla_f16_vg2_index(i32 %slice,
+; CHECK-LABEL: test_fmla_f16_vg2_index:
+; CHECK:  // %bb.0:
+; CHECK:    mov w8, w0
+; CHECK:    fmla za.h[w8, 0, vgx2], { z0.h, z1.h }, z2.h[7]
+; CHECK:    fmla za.h[w8, 7, vgx2], { z0.h, z1.h }, z2.h[7]
+; CHECK:    ret
+                                     <vscale x 8 x half> %a0, <vscale x 8 x half> %a1,
+                                     <vscale x 8 x half> %b) #0 {
+  call void @llvm.aarch64.sme.fmla.lane.vg1x2.nxv8f16(i32 %slice,
+                                                      <vscale x 8 x half> %a0, <vscale x 8 x half> %a1,
+                                                      <vscale x 8 x half> %b, i32 7);
+  %slice.7 = add i32 %slice, 7
+  call void @llvm.aarch64.sme.fmla.lane.vg1x2.nxv8f16(i32 %slice.7,
+                                                     <vscale x 8 x half> %a0, <vscale x 8 x half> %a1,
+                                                     <vscale x 8 x half> %b, i32 7);
+  ret void
+}
+
+define void @test_fmla_f16_vg4_index(i32 %slice,
+; CHECK-LABEL: test_fmla_f16_vg4_index:
+; CHECK:  // %bb.0:
+; CHECK:    mov w8, w0
+; CHECK:    fmla za.h[w8, 0, vgx4], { z0.h - z3.h }, z4.h[7]
+; CHECK:    fmla za.h[w8, 7, vgx4], { z0.h - z3.h }, z4.h[7]
+; CHECK:    ret
+                                     <vscale x 8 x half> %a0, <vscale x 8 x half> %a1,
+                                     <vscale x 8 x half> %a2, <vscale x 8 x half> %a3,
+                                     <vscale x 8 x half> %b) #0 {
+  call void @llvm.aarch64.sme.fmla.lane.vg1x4.nxv8f16(i32 %slice,
+                                                      <vscale x 8 x half> %a0, <vscale x 8 x half> %a1,
+                                                      <vscale x 8 x half> %a2, <vscale x 8 x half> %a3,
+                                                      <vscale x 8 x half> %b, i32 7);
+  %slice.7 = add i32 %slice, 7
+  call void @llvm.aarch64.sme.fmla.lane.vg1x4.nxv8f16(i32 %slice.7,
+                                                      <vscale x 8 x half> %a0, <vscale x 8 x half> %a1,
+                                                      <vscale x 8 x half> %a2, <vscale x 8 x half> %a3,
+                                                      <vscale x 8 x half> %b, i32 7);
+  ret void
+}
+
+define void @test_fmls_f16_vg2_index(i32 %slice,
+; CHECK-LABEL: test_fmls_f16_vg2_index:
+; CHECK:  // %bb.0:
+; CHECK:    mov w8, w0
+; CHECK:    fmls za.h[w8, 0, vgx2], { z0.h, z1.h }, z2.h[7]
+; CHECK:    fmls za.h[w8, 7, vgx2], { z0.h, z1.h }, z2.h[7]
+; CHECK:    ret
+                                     <vscale x 8 x half> %a0, <vscale x 8 x half> %a1,
+                                     <vscale x 8 x half> %b) #0 {
+  call void @llvm.aarch64.sme.fmls.lane.vg1x2.nxv8f16(i32 %slice,
+                                                      <vscale x 8 x half> %a0, <vscale x 8 x half> %a1,
+                                                      <vscale x 8 x half> %b, i32 7);
+  %slice.7 = add i32 %slice, 7
+  call void @llvm.aarch64.sme.fmls.lane.vg1x2.nxv8f16(i32 %slice.7,
+                                                     <vscale x 8 x half> %a0, <vscale x 8 x half> %a1,
+                                                     <vscale x 8 x half> %b, i32 7);
+  ret void
+}
+
+define void @test_fmls_f16_vg4_index(i32 %slice,
+; CHECK-LABEL: test_fmls_f16_vg4_index:
+; CHECK:  // %bb.0:
+; CHECK:    mov w8, w0
+; CHECK:    fmls za.h[w8, 0, vgx4], { z0.h - z3.h }, z4.h[7]
+; CHECK:    fmls za.h[w8, 7, vgx4], { z0.h - z3.h }, z4.h[7]
+; CHECK:    ret
+                                     <vscale x 8 x half> %a0, <vscale x 8 x half> %a1,
+                                     <vscale x 8 x half> %a2, <vscale x 8 x half> %a3,
+                                     <vscale x 8 x half> %b) #0 {
+  call void @llvm.aarch64.sme.fmls.lane.vg1x4.nxv8f16(i32 %slice,
+                                                      <vscale x 8 x half> %a0, <vscale x 8 x half> %a1,
+                                                      <vscale x 8 x half> %a2, <vscale x 8 x half> %a3,
+                                                      <vscale x 8 x half> %b, i32 7);
+  %slice.7 = add i32 %slice, 7
+  call void @llvm.aarch64.sme.fmls.lane.vg1x4.nxv8f16(i32 %slice.7,
+                                                      <vscale x 8 x half> %a0, <vscale x 8 x half> %a1,
+                                                      <vscale x 8 x half> %a2, <vscale x 8 x half> %a3,
+                                                      <vscale x 8 x half> %b, i32 7);
+  ret void
+}
+
+define void @test_fmla_bf16_vg2_single(i32 %slice, <vscale x 8 x bfloat> %a0, <vscale x 8 x bfloat> %a1, <vscale x 8 x bfloat> %b) #0 {
+; CHECK-LABEL: test_fmla_bf16_vg2_single:
+; CHECK:  // %bb.0:
+; CHECK:    mov w8, w0
+; CHECK:    bfmla za.h[w8, 0, vgx2], { z0.h, z1.h }, z2.h
+; CHECK:    bfmla za.h[w8, 7, vgx2], { z0.h, z1.h }, z2.h
+; CHECK:    ret
+  call void @llvm.aarch64.sme.fmla.single.vg1x2.nxv8bf16(i32 %slice, <vscale x 8 x bfloat> %a0, <vscale x 8 x bfloat> %a1, <vscale x 8 x bfloat> %b)
+  %slice.7 = add i32 %slice, 7
+  call void @llvm.aarch64.sme.fmla.single.vg1x2.nxv8bf16(i32 %slice.7, <vscale x 8 x bfloat> %a0, <vscale x 8 x bfloat> %a1, <vscale x 8 x bfloat> %b)
+  ret void
+}
+
+define void @test_fmla_bf16_vg4_single(i32 %slice, <vscale x 8 x bfloat> %a0, <vscale x 8 x bfloat> %a1,
+; CHECK-LABEL: test_fmla_bf16_vg4_single:
+; CHECK:  // %bb.0:
+; CHECK:    mov w8, w0
+; CHECK:    bfmla za.h[w8, 0, vgx4], { z0.h - z3.h }, z4.h
+; CHECK:    bfmla za.h[w8, 7, vgx4], { z0.h - z3.h }, z4.h
+; CHECK:    ret
+                                       <vscale x 8 x bfloat> %a2, <vscale x 8 x bfloat> %a3, <vscale x 8 x bfloat> %b) #0 {
+  call void @llvm.aarch64.sme.fmla.single.vg1x4.nxv8bf16(i32 %slice, <vscale x 8 x bfloat> %a0, <vscale x 8 x bfloat> %a1,
+                                                         <vscale x 8 x bfloat> %a2, <vscale x 8 x bfloat> %a3, <vscale x 8 x bfloat> %b)
+  %slice.7 = add i32 %slice, 7
+  call void @llvm.aarch64.sme.fmla.single.vg1x4.nxv8bf16(i32 %slice.7, <vscale x 8 x bfloat> %a0, <vscale x 8 x bfloat> %a1,
+                                                         <vscale x 8 x bfloat> %a2, <vscale x 8 x bfloat> %a3, <vscale x 8 x bfloat> %b)
+  ret void
+}
+
+define void @test_fmls_bf16_vg2_single(i32 %slice, <vscale x 8 x bfloat> %a0, <vscale x 8 x bfloat> %a1, <vscale x 8 x bfloat> %b) #0 {
+; CHECK-LABEL: test_fmls_bf16_vg2_single:
+; CHECK:  // %bb.0:
+; CHECK:    mov w8, w0
+; CHECK:    bfmls za.h[w8, 0, vgx2], { z0.h, z1.h }, z2.h
+; CHECK:    bfmls za.h[w8, 7, vgx2], { z0.h, z1.h }, z2.h
+; CHECK:    ret
+  call void @llvm.aarch64.sme.fmls.single.vg1x2.nxv8bf16(i32 %slice, <vscale x 8 x bfloat> %a0, <vscale x 8 x bfloat> %a1, <vscale x 8 x bfloat> %b)
+  %slice.7 = add i32 %slice, 7
+  call void @llvm.aarch64.sme.fmls.single.vg1x2.nxv8bf16(i32 %slice.7, <vscale x 8 x bfloat> %a0, <vscale x 8 x bfloat> %a1, <vscale x 8 x bfloat> %b)
+  ret void
+}
+
+define void @test_fmls_bf16_vg4_single(i32 %slice, <vscale x 8 x bfloat> %a0, <vscale x 8 x bfloat> %a1,
+; CHECK-LABEL: test_fmls_bf16_vg4_single:
+; CHECK:  // %bb.0:
+; CHECK:    mov w8, w0
+; CHECK:    bfmls za.h[w8, 0, vgx4], { z0.h - z3.h }, z4.h
+; CHECK:    bfmls za.h[w8, 7, vgx4], { z0.h - z3.h }, z4.h
+; CHECK:    ret
+                                       <vscale x 8 x bfloat> %a2, <vscale x 8 x bfloat> %a3, <vscale x 8 x bfloat> %b) #0 {
+  call void @llvm.aarch64.sme.fmls.single.vg1x4.nxv8bf16(i32 %slice, <vscale x 8 x bfloat> %a0, <vscale x 8 x bfloat> %a1,
+                                                         <vscale x 8 x bfloat> %a2, <vscale x 8 x bfloat> %a3, <vscale x 8 x bfloat> %b)
+  %slice.7 = add i32 %slice, 7
+  call void @llvm.aarch64.sme.fmls.single.vg1x4.nxv8bf16(i32 %slice.7, <vscale x 8 x bfloat> %a0, <vscale x 8 x bfloat> %a1,
+                                                         <vscale x 8 x bfloat> %a2, <vscale x 8 x bfloat> %a3, <vscale x 8 x bfloat> %b)
+  ret void
+}
+
+define void @test_fmla_bf16_vg2_multi(i32 %slice,
+; CHECK-LABEL: test_fmla_bf16_vg2_multi:
+; CHECK:  // %bb.0:
+; CHECK:    mov w8, w0
+; CHECK:    bfmla za.h[w8, 0, vgx2], { z0.h, z1.h }, { z2.h, z3.h }
+; CHECK:    bfmla za.h[w8, 7, vgx2], { z0.h, z1.h }, { z2.h, z3.h }
+; CHECK:    ret
+                                      <vscale x 8 x bfloat> %a0, <vscale x 8 x bfloat> %a1,
+                                      <vscale x 8 x bfloat> %b0, <vscale x 8 x bfloat> %b1) #0 {
+  call void @llvm.aarch64.sme.fmla.vg1x2.nxv8bf16(i32 %slice,
+                                                 <vscale x 8 x bfloat> %a0, <vscale x 8 x bfloat> %a1,
+                                                 <vscale x 8 x bfloat> %b0, <vscale x 8 x bfloat> %b1)
+  %slice.7 = add i32 %slice, 7
+  call void @llvm.aarch64.sme.fmla.vg1x2.nxv8bf16(i32 %slice.7,
+                                                  <vscale x 8 x bfloat> %a0, <vscale x 8 x bfloat> %a1,
+                                                  <vscale x 8 x bfloat> %b0, <vscale x 8 x bfloat> %b1)
+  ret void
+}
+
+define void @test_fmla_bf16_vg4_multi(i32 %slice,
+; CHECK-LABEL: test_fmla_bf16_vg4_multi:
+; CHECK:  // %bb.0:
+; CHECK:    mov w8, w0
+; CHECK:    bfmla za.h[w8, 0, vgx4], { z0.h - z3.h }, { z4.h - z7.h }
+; CHECK:    bfmla za.h[w8, 7, vgx4], { z0.h - z3.h }, { z4.h - z7.h }
+; CHECK:    ret
+                                      <vscale x 8 x bfloat> %a0, <vscale x 8 x bfloat> %a1,
+                                      <vscale x 8 x bfloat> %a2, <vscale x 8 x bfloat> %a3,
+                                      <vscale x 8 x bfloat> %b0, <vscale x 8 x bfloat> %b1,
+                                      <vscale x 8 x bfloat> %b2, <vscale x 8 x bfloat> %b3) #0 {
+  call void @llvm.aarch64.sme.fmla.vg1x4.nxv8bf16(i32 %slice,
+                                                  <vscale x 8 x bfloat> %a0, <vscale x 8 x bfloat> %a1,
+                                                  <vscale x 8 x bfloat> %a2, <vscale x 8 x bfloat> %a3,
+                                                  <vscale x 8 x bfloat> %b0, <vscale x 8 x bfloat> %b1,
+                                                  <vscale x 8 x bfloat> %b2, <vscale x 8 x bfloat> %b3)
+  %slice.7 = add i32 %slice, 7
+  call void @llvm.aarch64.sme.fmla.vg1x4.nxv8bf16(i32 %slice.7,
+                                                  <vscale x 8 x bfloat> %a0, <vscale x 8 x bfloat> %a1,
+                                                  <vscale x 8 x bfloat> %a2, <vscale x 8 x bfloat> %a3,
+                                                  <vscale x 8 x bfloat> %b0, <vscale x 8 x bfloat> %b1,
+                                                  <vscale x 8 x bfloat> %b2, <vscale x 8 x bfloat> %b3)
+  ret void
+}
+
+define void @test_fmls_bf16_vg2_multi(i32 %slice,
+; CHECK-LABEL: test_fmls_bf16_vg2_multi:
+; CHECK:  // %bb.0:
+; CHECK:    mov w8, w0
+; CHECK:    bfmls za.h[w8, 0, vgx2], { z0.h, z1.h }, { z2.h, z3.h }
+; CHECK:    bfmls za.h[w8, 7, vgx2], { z0.h, z1.h }, { z2.h, z3.h }
+; CHECK:    ret
+                                     <vscale x 8 x bfloat> %a0, <vscale x 8 x bfloat> %a1,
+                                     <vscale x 8 x bfloat> %b0, <vscale x 8 x bfloat> %b1) #0 {
+  call void @llvm.aarch64.sme.fmls.vg1x2.nxv8bf16(i32 %slice,
+                                                <vscale x 8 x bfloat> %a0, <vscale x 8 x bfloat> %a1,
+                                                <vscale x 8 x bfloat> %b0, <vscale x 8 x bfloat> %b1)
+  %slice.7 = add i32 %slice, 7
+  call void @llvm.aarch64.sme.fmls.vg1x2.nxv8bf16(i32 %slice.7,
+                                                 <vscale x 8 x bfloat> %a0, <vscale x 8 x bfloat> %a1,
+                                                 <vscale x 8 x bfloat> %b0, <vscale x 8 x bfloat> %b1)
+  ret void
+}
+
+define void @test_fmls_bf16_vg4_multi(i32 %slice,
+; CHECK-LABEL: test_fmls_bf16_vg4_multi:
+; CHECK:  // %bb.0:
+; CHECK:    mov w8, w0
+; CHECK:    bfmls za.h[w8, 0, vgx4], { z0.h - z3.h }, { z4.h - z7.h }
+; CHECK:    bfmls za.h[w8, 7, vgx4], { z0.h - z3.h }, { z4.h - z7.h }
+; CHECK:    ret
+                                     <vscale x 8 x bfloat> %a0, <vscale x 8 x bfloat> %a1,
+                                     <vscale x 8 x bfloat> %a2, <vscale x 8 x bfloat> %a3,
+                                     <vscale x 8 x bfloat> %b0, <vscale x 8 x bfloat> %b1,
+                                     <vscale x 8 x bfloat> %b2, <vscale x 8 x bfloat> %b3) #0 {
+  call void @llvm.aarch64.sme.fmls.vg1x4.nxv8bf16(i32 %slice,
+                                                 <vscale x 8 x bfloat> %a0, <vscale x 8 x bfloat> %a1,
+                                                 <vscale x 8 x bfloat> %a2, <vscale x 8 x bfloat> %a3,
+                                                 <vscale x 8 x bfloat> %b0, <vscale x 8 x bfloat> %b1,
+                                                 <vscale x 8 x bfloat> %b2, <vscale x 8 x bfloat> %b3)
+  %slice.7 = add i32 %slice, 7
+  call void @llvm.aarch64.sme.fmls.vg1x4.nxv8bf16(i32 %slice.7,
+                                                 <vscale x 8 x bfloat> %a0, <vscale x 8 x bfloat> %a1,
+                                                 <vscale x 8 x bfloat> %a2, <vscale x 8 x bfloat> %a3,
+                                                 <vscale x 8 x bfloat> %b0, <vscale x 8 x bfloat> %b1,
+                                                 <vscale x 8 x bfloat> %b2, <vscale x 8 x bfloat> %b3)
+  ret void
+}
+
+define void @test_fmla_bf16_vg2_index(i32 %slice,
+; CHECK-LABEL: test_fmla_bf16_vg2_index:
+; CHECK:  // %bb.0:
+; CHECK:    mov w8, w0
+; CHECK:    bfmla za.h[w8, 0, vgx2], { z0.h, z1.h }, z2.h[7]
+; CHECK:    bfmla za.h[w8, 7, vgx2], { z0.h, z1.h }, z2.h[7]
+; CHECK:    ret
+                                      <vscale x 8 x bfloat> %a0, <vscale x 8 x bfloat> %a1,
+                                      <vscale x 8 x bfloat> %b) #0 {
+  call void @llvm.aarch64.sme.fmla.lane.vg1x2.nxv8bf16(i32 %slice,
+                                                       <vscale x 8 x bfloat> %a0, <vscale x 8 x bfloat> %a1,
+                                                       <vscale x 8 x bfloat> %b, i32 7);
+  %slice.7 = add i32 %slice, 7
+  call void @llvm.aarch64.sme.fmla.lane.vg1x2.nxv8bf16(i32 %slice.7,
+                                                       <vscale x 8 x bfloat> %a0, <vscale x 8 x bfloat> %a1,
+                                                       <vscale x 8 x bfloat> %b, i32 7);
+  ret void
+}
+
+define void @test_fmla_bf16_vg4_index(i32 %slice,
+; CHECK-LABEL: test_fmla_bf16_vg4_index:
+; CHECK:  // %bb.0:
+; CHECK:    mov w8, w0
+; CHECK:    bfmla za.h[w8, 0, vgx4], { z0.h - z3.h }, z4.h[7]
+; CHECK:    bfmla za.h[w8, 7, vgx4], { z0.h - z3.h }, z4.h[7]
+; CHECK:    ret
+                                      <vscale x 8 x bfloat> %a0, <vscale x 8 x bfloat> %a1,
+                                      <vscale x 8 x bfloat> %a2, <vscale x 8 x bfloat> %a3,
+                                      <vscale x 8 x bfloat> %b) #0 {
+  call void @llvm.aarch64.sme.fmla.lane.vg1x4.nxv8bf16(i32 %slice,
+                                                       <vscale x 8 x bfloat> %a0, <vscale x 8 x bfloat> %a1,
+                                                       <vscale x 8 x bfloat> %a2, <vscale x 8 x bfloat> %a3,
+                                                       <vscale x 8 x bfloat> %b, i32 7);
+  %slice.7 = add i32 %slice, 7
+  call void @llvm.aarch64.sme.fmla.lane.vg1x4.nxv8bf16(i32 %slice.7,
+                                                       <vscale x 8 x bfloat> %a0, <vscale x 8 x bfloat> %a1,
+                                                       <vscale x 8 x bfloat> %a2, <vscale x 8 x bfloat> %a3,
+                                                       <vscale x 8 x bfloat> %b, i32 7);
+  ret void
+}
+
+define void @test_fmls_bf16_vg2_index(i32 %slice,
+; CHECK-LABEL: test_fmls_bf16_vg2_index:
+; CHECK:  // %bb.0:
+; CHECK:    mov w8, w0
+; CHECK:    bfmls za.h[w8, 0, vgx2], { z0.h, z1.h }, z2.h[7]
+; CHECK:    bfmls za.h[w8, 7, vgx2], { z0.h, z1.h }, z2.h[7]
+; CHECK:    ret
+                                      <vscale x 8 x bfloat> %a0, <vscale x 8 x bfloat> %a1,
+                                      <vscale x 8 x bfloat> %b) #0 {
+  call void @llvm.aarch64.sme.fmls.lane.vg1x2.nxv8bf16(i32 %slice,
+                                                       <vscale x 8 x bfloat> %a0, <vscale x 8 x bfloat> %a1,
+                                                       <vscale x 8 x bfloat> %b, i32 7);
+  %slice.7 = add i32 %slice, 7
+  call void @llvm.aarch64.sme.fmls.lane.vg1x2.nxv8bf16(i32 %slice.7,
+                                                      <vscale x 8 x bfloat> %a0, <vscale x 8 x bfloat> %a1,
+                                                      <vscale x 8 x bfloat> %b, i32 7);
+  ret void
+}
+
+define void @test_fmls_bf16_vg4_index(i32 %slice,
+; CHECK-LABEL: test_fmls_bf16_vg4_index:
+; CHECK:  // %bb.0:
+; CHECK:    mov w8, w0
+; CHECK:    bfmls za.h[w8, 0, vgx4], { z0.h - z3.h }, z4.h[7]
+; CHECK:    bfmls za.h[w8, 7, vgx4], { z0.h - z3.h }, z4.h[7]
+; CHECK:    ret
+                                      <vscale x 8 x bfloat> %a0, <vscale x 8 x bfloat> %a1,
+                                      <vscale x 8 x bfloat> %a2, <vscale x 8 x bfloat> %a3,
+                                      <vscale x 8 x bfloat> %b) #0 {
+  call void @llvm.aarch64.sme.fmls.lane.vg1x4.nxv8bf16(i32 %slice,
+                                                       <vscale x 8 x bfloat> %a0, <vscale x 8 x bfloat> %a1,
+                                                       <vscale x 8 x bfloat> %a2, <vscale x 8 x bfloat> %a3,
+                                                       <vscale x 8 x bfloat> %b, i32 7);
+  %slice.7 = add i32 %slice, 7
+  call void @llvm.aarch64.sme.fmls.lane.vg1x4.nxv8bf16(i32 %slice.7,
+                                                       <vscale x 8 x bfloat> %a0, <vscale x 8 x bfloat> %a1,
+                                                       <vscale x 8 x bfloat> %a2, <vscale x 8 x bfloat> %a3,
+                                                       <vscale x 8 x bfloat> %b, i32 7);
+  ret void
+}
+
+attributes #0 = { nounwind "target-features"="+sme2p1,+sme-f16f16,+b16b16" }


