[clang] [AArch64][SME2] Add SME2 MLA/MLS builtins. (PR #75584)
Dinar Temirbulatov via cfe-commits
cfe-commits at lists.llvm.org
Thu Dec 21 03:13:29 PST 2023
https://github.com/dtemirbulatov updated https://github.com/llvm/llvm-project/pull/75584
From c3275cbd83196fea04c5f23e4de974f816e700e7 Mon Sep 17 00:00:00 2001
From: Dinar Temirbulatov <Dinar.Temirbulatov at arm.com>
Date: Fri, 15 Dec 2023 02:00:31 +0000
Subject: [PATCH 1/7] [AArch64][SME2] Add SME2 MLA/MLS builtins.
Add SME2 MLA/MLS builtins: FMLA/FMLS on 32-bit and 64-bit ZA single-vector
groups (vg1x2/vg1x4), widening FMLAL/FMLSL and SMLAL/SMLSL/UMLAL/UMLSL
(vg2x1/x2/x4), and quad-widening SMLALL/UMLALL/USMLALL/SUMLALL
(vg4x1/x2/x4), each in multi-multi, multi-single and multi-indexed
variants. Also add CodeGen support for mapping the commutative svsumla
builtins onto the usmla intrinsics with their operand groups swapped.
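
A minimal usage sketch (illustrative only; the exact signatures are the
ones generated from the arm_sme.td additions below):

  #include <arm_sme_draft_spec_subject_to_change.h>

  void mla_example(uint32_t slice, svfloat32x2_t zn, svfloat32x2_t zm)
      __arm_streaming __arm_shared_za {
    // FMLA of two vector pairs into consecutive ZA single-vector groups.
    svmla_za32_f32_vg1x2(slice, zn, zm);
  }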
---
clang/include/clang/Basic/arm_sme.td | 217 ++
clang/lib/CodeGen/CGBuiltin.cpp | 25 +
.../aarch64-sme2-intrinsics/acle_sme2_mlal.c | 760 +++++++
.../aarch64-sme2-intrinsics/acle_sme2_mlall.c | 1950 +++++++++++++++++
.../aarch64-sme2-intrinsics/acle_sme2_mlsl.c | 760 +++++++
5 files changed, 3712 insertions(+)
create mode 100644 clang/test/CodeGen/aarch64-sme2-intrinsics/acle_sme2_mlal.c
create mode 100644 clang/test/CodeGen/aarch64-sme2-intrinsics/acle_sme2_mlall.c
create mode 100644 clang/test/CodeGen/aarch64-sme2-intrinsics/acle_sme2_mlsl.c
diff --git a/clang/include/clang/Basic/arm_sme.td b/clang/include/clang/Basic/arm_sme.td
index ce99ca82c1d396..bda493f58441a8 100644
--- a/clang/include/clang/Basic/arm_sme.td
+++ b/clang/include/clang/Basic/arm_sme.td
@@ -353,6 +353,223 @@ let TargetGuard = "sme2" in {
def SVBMOPS : Inst<"svbmops_za32[_{d}]_m", "viPPdd", "iUi", MergeNone, "aarch64_sme_bmops_za32", [IsSharedZA, IsStreaming], [ImmCheck<0, ImmCheck0_3>]>;
}
+// FMLA/FMLS
+let TargetGuard = "sme2" in {
+ def SVMLA_MULTI_VG1x2_F32 : Inst<"svmla_za32[_{d}]_vg1x2", "vm22", "f", MergeNone, "aarch64_sme_fmla_vg1x2", [IsStreaming, IsSharedZA], []>;
+ def SVMLA_MULTI_VG1x4_F32 : Inst<"svmla_za32[_{d}]_vg1x4", "vm44", "f", MergeNone, "aarch64_sme_fmla_vg1x4", [IsStreaming, IsSharedZA], []>;
+ def SVMLS_MULTI_VG1x2_F32 : Inst<"svmls_za32[_{d}]_vg1x2", "vm22", "f", MergeNone, "aarch64_sme_fmls_vg1x2", [IsStreaming, IsSharedZA], []>;
+ def SVMLS_MULTI_VG1x4_F32 : Inst<"svmls_za32[_{d}]_vg1x4", "vm44", "f", MergeNone, "aarch64_sme_fmls_vg1x4", [IsStreaming, IsSharedZA], []>;
+
+ def SVMLA_SINGLE_VG1x2_F32 : Inst<"svmla[_single]_za32[_{d}]_vg1x2", "vm2d", "f", MergeNone, "aarch64_sme_fmla_single_vg1x2", [IsStreaming, IsSharedZA], []>;
+ def SVMLA_SINGLE_VG1x4_F32 : Inst<"svmla[_single]_za32[_{d}]_vg1x4", "vm4d", "f", MergeNone, "aarch64_sme_fmla_single_vg1x4", [IsStreaming, IsSharedZA], []>;
+ def SVMLS_SINGLE_VG1x2_F32 : Inst<"svmls[_single]_za32[_{d}]_vg1x2", "vm2d", "f", MergeNone, "aarch64_sme_fmls_single_vg1x2", [IsStreaming, IsSharedZA], []>;
+ def SVMLS_SINGLE_VG1x4_F32 : Inst<"svmls[_single]_za32[_{d}]_vg1x4", "vm4d", "f", MergeNone, "aarch64_sme_fmls_single_vg1x4", [IsStreaming, IsSharedZA], []>;
+
+ def SVMLA_LANE_VG1x2_F32 : Inst<"svmla_lane_za32[_{d}]_vg1x2", "vm2di", "f", MergeNone, "aarch64_sme_fmla_lane_vg1x2", [IsStreaming, IsSharedZA], [ImmCheck<3, ImmCheck0_3>]>;
+ def SVMLA_LANE_VG1x4_F32 : Inst<"svmla_lane_za32[_{d}]_vg1x4", "vm4di", "f", MergeNone, "aarch64_sme_fmla_lane_vg1x4", [IsStreaming, IsSharedZA], [ImmCheck<3, ImmCheck0_3>]>;
+ def SVMLS_LANE_VG1x2_F32 : Inst<"svmls_lane_za32[_{d}]_vg1x2", "vm2di", "f", MergeNone, "aarch64_sme_fmls_lane_vg1x2", [IsStreaming, IsSharedZA], [ImmCheck<3, ImmCheck0_3>]>;
+ def SVMLS_LANE_VG1x4_F32 : Inst<"svmls_lane_za32[_{d}]_vg1x4", "vm4di", "f", MergeNone, "aarch64_sme_fmls_lane_vg1x4", [IsStreaming, IsSharedZA], [ImmCheck<3, ImmCheck0_3>]>;
+}
+
+let TargetGuard = "sme2,sme-f64f64" in {
+ def SVMLA_MULTI_VG1x2_F64 : Inst<"svmla_za64[_{d}]_vg1x2", "vm22", "d", MergeNone, "aarch64_sme_fmla_vg1x2", [IsStreaming, IsSharedZA], []>;
+ def SVMLA_MULTI_VG1x4_F64 : Inst<"svmla_za64[_{d}]_vg1x4", "vm44", "d", MergeNone, "aarch64_sme_fmla_vg1x4", [IsStreaming, IsSharedZA], []>;
+ def SVMLS_MULTI_VG1x2_F64 : Inst<"svmls_za64[_{d}]_vg1x2", "vm22", "d", MergeNone, "aarch64_sme_fmls_vg1x2", [IsStreaming, IsSharedZA], []>;
+ def SVMLS_MULTI_VG1x4_F64 : Inst<"svmls_za64[_{d}]_vg1x4", "vm44", "d", MergeNone, "aarch64_sme_fmls_vg1x4", [IsStreaming, IsSharedZA], []>;
+
+ def SVMLA_SINGLE_VG1x2_F64 : Inst<"svmla[_single]_za64[_{d}]_vg1x2", "vm2d", "d", MergeNone, "aarch64_sme_fmla_single_vg1x2", [IsStreaming, IsSharedZA], []>;
+ def SVMLA_SINGLE_VG1x4_F64 : Inst<"svmla[_single]_za64[_{d}]_vg1x4", "vm4d", "d", MergeNone, "aarch64_sme_fmla_single_vg1x4", [IsStreaming, IsSharedZA], []>;
+ def SVMLS_SINGLE_VG1x2_F64 : Inst<"svmls[_single]_za64[_{d}]_vg1x2", "vm2d", "d", MergeNone, "aarch64_sme_fmls_single_vg1x2", [IsStreaming, IsSharedZA], []>;
+ def SVMLS_SINGLE_VG1x4_F64 : Inst<"svmls[_single]_za64[_{d}]_vg1x4", "vm4d", "d", MergeNone, "aarch64_sme_fmls_single_vg1x4", [IsStreaming, IsSharedZA], []>;
+
+ def SVMLA_LANE_VG1x2_F64 : Inst<"svmla_lane_za64[_{d}]_vg1x2", "vm2di", "d", MergeNone, "aarch64_sme_fmla_lane_vg1x2", [IsStreaming, IsSharedZA], [ImmCheck<3, ImmCheck0_1>]>;
+ def SVMLA_LANE_VG1x4_F64 : Inst<"svmla_lane_za64[_{d}]_vg1x4", "vm4di", "d", MergeNone, "aarch64_sme_fmla_lane_vg1x4", [IsStreaming, IsSharedZA], [ImmCheck<3, ImmCheck0_1>]>;
+ def SVMLS_LANE_VG1x2_F64 : Inst<"svmls_lane_za64[_{d}]_vg1x2", "vm2di", "d", MergeNone, "aarch64_sme_fmls_lane_vg1x2", [IsStreaming, IsSharedZA], [ImmCheck<3, ImmCheck0_1>]>;
+ def SVMLS_LANE_VG1x4_F64 : Inst<"svmls_lane_za64[_{d}]_vg1x4", "vm4di", "d", MergeNone, "aarch64_sme_fmls_lane_vg1x4", [IsStreaming, IsSharedZA], [ImmCheck<3, ImmCheck0_1>]>;
+}
+
+// FMLAL/FMLSL/UMLAL/SMLAL
+// SMLALL/UMLALL/USMLALL/SUMLALL
+let TargetGuard = "sme2" in {
+ // MULTI MLAL
+ def SVMLAL_MULTI_VG2x2_F16 : Inst<"svmla_za32[_{d}]_vg2x2", "vm22", "bh", MergeNone, "aarch64_sme_fmlal_vg2x2", [IsStreaming, IsSharedZA], []>;
+ def SVMLAL_MULTI_VG2x4_F16 : Inst<"svmla_za32[_{d}]_vg2x4", "vm44", "bh", MergeNone, "aarch64_sme_fmlal_vg2x4", [IsStreaming, IsSharedZA], []>;
+ def SVMLAL_MULTI_VG2x2_S16 : Inst<"svmla_za32[_{d}]_vg2x2", "vm22", "s", MergeNone, "aarch64_sme_smlal_vg2x2", [IsStreaming, IsSharedZA], []>;
+ def SVMLAL_MULTI_VG2x4_S16 : Inst<"svmla_za32[_{d}]_vg2x4", "vm44", "s", MergeNone, "aarch64_sme_smlal_vg2x4", [IsStreaming, IsSharedZA], []>;
+ def SVMLAL_MULTI_VG2x2_U16 : Inst<"svmla_za32[_{d}]_vg2x2", "vm22", "Us", MergeNone, "aarch64_sme_umlal_vg2x2", [IsStreaming, IsSharedZA], []>;
+ def SVMLAL_MULTI_VG2x4_U16 : Inst<"svmla_za32[_{d}]_vg2x4", "vm44", "Us", MergeNone, "aarch64_sme_umlal_vg2x4", [IsStreaming, IsSharedZA], []>;
+
+ def SVMLAL_MULTI_VG4x2_S8 : Inst<"svmla_za32[_{d}]_vg4x2", "vm22", "c", MergeNone, "aarch64_sme_smla_za32_vg4x2", [IsStreaming, IsSharedZA], []>;
+ def SVMLAL_MULTI_VG4x2_U8 : Inst<"svmla_za32[_{d}]_vg4x2", "vm22", "Uc", MergeNone, "aarch64_sme_umla_za32_vg4x2", [IsStreaming, IsSharedZA], []>;
+ def SVMLAL_MULTI_VG4x4_S8 : Inst<"svmla_za32[_{d}]_vg4x4", "vm44", "c", MergeNone, "aarch64_sme_smla_za32_vg4x4", [IsStreaming, IsSharedZA], []>;
+ def SVMLAL_MULTI_VG4x4_U8 : Inst<"svmla_za32[_{d}]_vg4x4", "vm44", "Uc", MergeNone, "aarch64_sme_umla_za32_vg4x4", [IsStreaming, IsSharedZA], []>;
+
+ let TargetGuard = "sme-i16i64" in {
+ def SVMLAL_MULTI_VG4x2_S16 : Inst<"svmla_za64[_{d}]_vg4x2", "vm22", "s", MergeNone, "aarch64_sme_smla_za64_vg4x2", [IsStreaming, IsSharedZA], []>;
+ def SVMLAL_MULTI_VG4x2_U16 : Inst<"svmla_za64[_{d}]_vg4x2", "vm22", "Us", MergeNone, "aarch64_sme_umla_za64_vg4x2", [IsStreaming, IsSharedZA], []>;
+ def SVMLAL_MULTI_VG4x4_S16 : Inst<"svmla_za64[_{d}]_vg4x4", "vm44", "s", MergeNone, "aarch64_sme_smla_za64_vg4x4", [IsStreaming, IsSharedZA], []>;
+ def SVMLAL_MULTI_VG4x4_U16 : Inst<"svmla_za64[_{d}]_vg4x4", "vm44", "Us", MergeNone, "aarch64_sme_umla_za64_vg4x4", [IsStreaming, IsSharedZA], []>;
+ }
+
+ // MULTI MLSL
+ def SVMLSL_MULTI_VG2x2_F16 : Inst<"svmls_za32[_{d}]_vg2x2", "vm22", "bh", MergeNone, "aarch64_sme_fmlsl_vg2x2", [IsStreaming, IsSharedZA], []>;
+ def SVMLSL_MULTI_VG2x4_F16 : Inst<"svmls_za32[_{d}]_vg2x4", "vm44", "bh", MergeNone, "aarch64_sme_fmlsl_vg2x4", [IsStreaming, IsSharedZA], []>;
+ def SVMLSL_MULTI_VG2x2_S16 : Inst<"svmls_za32[_{d}]_vg2x2", "vm22", "s", MergeNone, "aarch64_sme_smlsl_vg2x2", [IsStreaming, IsSharedZA], []>;
+ def SVMLSL_MULTI_VG2x4_S16 : Inst<"svmls_za32[_{d}]_vg2x4", "vm44", "s", MergeNone, "aarch64_sme_smlsl_vg2x4", [IsStreaming, IsSharedZA], []>;
+ def SVMLSL_MULTI_VG2x2_U16 : Inst<"svmls_za32[_{d}]_vg2x2", "vm22", "Us", MergeNone, "aarch64_sme_umlsl_vg2x2", [IsStreaming, IsSharedZA], []>;
+ def SVMLSL_MULTI_VG2x4_U16 : Inst<"svmls_za32[_{d}]_vg2x4", "vm44", "Us", MergeNone, "aarch64_sme_umlsl_vg2x4", [IsStreaming, IsSharedZA], []>;
+
+ def SVMLSL_MULTI_VG4x2_S8 : Inst<"svmls_za32[_{d}]_vg4x2", "vm22", "c", MergeNone, "aarch64_sme_smls_za32_vg4x2", [IsStreaming, IsSharedZA], []>;
+ def SVMLSL_MULTI_VG4x2_U8 : Inst<"svmls_za32[_{d}]_vg4x2", "vm22", "Uc", MergeNone, "aarch64_sme_umls_za32_vg4x2", [IsStreaming, IsSharedZA], []>;
+ def SVMLSL_MULTI_VG4x4_S8 : Inst<"svmls_za32[_{d}]_vg4x4", "vm44", "c", MergeNone, "aarch64_sme_smls_za32_vg4x4", [IsStreaming, IsSharedZA], []>;
+ def SVMLSL_MULTI_VG4x4_U8 : Inst<"svmls_za32[_{d}]_vg4x4", "vm44", "Uc", MergeNone, "aarch64_sme_umls_za32_vg4x4", [IsStreaming, IsSharedZA], []>;
+
+ let TargetGuard = "sme-i16i64" in {
+ def SVMLSL_MULTI_VG4x2_S16 : Inst<"svmls_za64[_{d}]_vg4x2", "vm22", "s", MergeNone, "aarch64_sme_smls_za64_vg4x2", [IsStreaming, IsSharedZA], []>;
+ def SVMLSL_MULTI_VG4x2_U16 : Inst<"svmls_za64[_{d}]_vg4x2", "vm22", "Us", MergeNone, "aarch64_sme_umls_za64_vg4x2", [IsStreaming, IsSharedZA], []>;
+ def SVMLSL_MULTI_VG4x4_S16 : Inst<"svmls_za64[_{d}]_vg4x4", "vm44", "s", MergeNone, "aarch64_sme_smls_za64_vg4x4", [IsStreaming, IsSharedZA], []>;
+ def SVMLSL_MULTI_VG4x4_U16 : Inst<"svmls_za64[_{d}]_vg4x4", "vm44", "Us", MergeNone, "aarch64_sme_umls_za64_vg4x4", [IsStreaming, IsSharedZA], []>;
+ }
+
+ // SINGLE MLAL
+ def SVMLAL_SINGLE_VG2x1_F16 : Inst<"svmla_za32[_{d}]_vg2x1", "vmdd", "bh", MergeNone, "aarch64_sme_fmlal_single_vg2x1", [IsStreaming, IsSharedZA], []>;
+ def SVMLAL_SINGLE_VG2x2_F16 : Inst<"svmla[_single]_za32[_{d}]_vg2x2", "vm2d", "bh", MergeNone, "aarch64_sme_fmlal_single_vg2x2", [IsStreaming, IsSharedZA], []>;
+ def SVMLAL_SINGLE_VG2x4_F16 : Inst<"svmla[_single]_za32[_{d}]_vg2x4", "vm4d", "bh", MergeNone, "aarch64_sme_fmlal_single_vg2x4", [IsStreaming, IsSharedZA], []>;
+ def SVMLAL_SINGLE_VG2x1_S16 : Inst<"svmla_za32[_{d}]_vg2x1", "vmdd", "s", MergeNone, "aarch64_sme_smlal_single_vg2x1", [IsStreaming, IsSharedZA], []>;
+ def SVMLAL_SINGLE_VG2x2_S16 : Inst<"svmla[_single]_za32[_{d}]_vg2x2", "vm2d", "s", MergeNone, "aarch64_sme_smlal_single_vg2x2", [IsStreaming, IsSharedZA], []>;
+ def SVMLAL_SINGLE_VG2x4_S16 : Inst<"svmla[_single]_za32[_{d}]_vg2x4", "vm4d", "s", MergeNone, "aarch64_sme_smlal_single_vg2x4", [IsStreaming, IsSharedZA], []>;
+ def SVMLAL_SINGLE_VG2x1_U16 : Inst<"svmla_za32[_{d}]_vg2x1", "vmdd", "Us", MergeNone, "aarch64_sme_umlal_single_vg2x1", [IsStreaming, IsSharedZA], []>;
+ def SVMLAL_SINGLE_VG2x2_U16 : Inst<"svmla[_single]_za32[_{d}]_vg2x2", "vm2d", "Us", MergeNone, "aarch64_sme_umlal_single_vg2x2", [IsStreaming, IsSharedZA], []>;
+ def SVMLAL_SINGLE_VG2x4_U16 : Inst<"svmla[_single]_za32[_{d}]_vg2x4", "vm4d", "Us", MergeNone, "aarch64_sme_umlal_single_vg2x4", [IsStreaming, IsSharedZA], []>;
+
+ def SVMLAL_SINGLE_VG4x1_S8 : Inst<"svmla_za32[_{d}]_vg4x1", "vmdd", "c", MergeNone, "aarch64_sme_smla_za32_single_vg4x1", [IsStreaming, IsSharedZA], []>;
+ def SVMLAL_SINGLE_VG4x1_U8 : Inst<"svmla_za32[_{d}]_vg4x1", "vmdd", "Uc", MergeNone, "aarch64_sme_umla_za32_single_vg4x1", [IsStreaming, IsSharedZA], []>;
+ def SVMLAL_SINGLE_VG4x2_S8 : Inst<"svmla[_single]_za32[_{d}]_vg4x2", "vm2d", "c", MergeNone, "aarch64_sme_smla_za32_single_vg4x2", [IsStreaming, IsSharedZA], []>;
+ def SVMLAL_SINGLE_VG4x2_U8 : Inst<"svmla[_single]_za32[_{d}]_vg4x2", "vm2d", "Uc", MergeNone, "aarch64_sme_umla_za32_single_vg4x2", [IsStreaming, IsSharedZA], []>;
+ def SVMLAL_SINGLE_VG4x4_S8 : Inst<"svmla[_single]_za32[_{d}]_vg4x4", "vm4d", "c", MergeNone, "aarch64_sme_smla_za32_single_vg4x4", [IsStreaming, IsSharedZA], []>;
+ def SVMLAL_SINGLE_VG4x4_U8 : Inst<"svmla[_single]_za32[_{d}]_vg4x4", "vm4d", "Uc", MergeNone, "aarch64_sme_umla_za32_single_vg4x4", [IsStreaming, IsSharedZA], []>;
+
+ let TargetGuard = "sme-i16i64" in {
+ def SVMLAL_SINGLE_VG4x1_S16 : Inst<"svmla_za64[_{d}]_vg4x1", "vmdd", "s", MergeNone, "aarch64_sme_smla_za64_single_vg4x1", [IsStreaming, IsSharedZA], []>;
+ def SVMLAL_SINGLE_VG4x1_U16 : Inst<"svmla_za64[_{d}]_vg4x1", "vmdd", "Us", MergeNone, "aarch64_sme_umla_za64_single_vg4x1", [IsStreaming, IsSharedZA], []>;
+ def SVMLAL_SINGLE_VG4x2_S16 : Inst<"svmla[_single]_za64[_{d}]_vg4x2", "vm2d", "s", MergeNone, "aarch64_sme_smla_za64_single_vg4x2", [IsStreaming, IsSharedZA], []>;
+ def SVMLAL_SINGLE_VG4x2_U16 : Inst<"svmla[_single]_za64[_{d}]_vg4x2", "vm2d", "Us", MergeNone, "aarch64_sme_umla_za64_single_vg4x2", [IsStreaming, IsSharedZA], []>;
+ def SVMLAL_SINGLE_VG4x4_S16 : Inst<"svmla[_single]_za64[_{d}]_vg4x4", "vm4d", "s", MergeNone, "aarch64_sme_smla_za64_single_vg4x4", [IsStreaming, IsSharedZA], []>;
+ def SVMLAL_SINGLE_VG4x4_U16 : Inst<"svmla[_single]_za64[_{d}]_vg4x4", "vm4d", "Us", MergeNone, "aarch64_sme_umla_za64_single_vg4x4", [IsStreaming, IsSharedZA], []>;
+ }
+
+ // SINGLE MLSL
+ def SVMLSL_SINGLE_VG2x1_F16 : Inst<"svmls_za32[_{d}]_vg2x1", "vmdd", "bh", MergeNone, "aarch64_sme_fmlsl_single_vg2x1", [IsStreaming, IsSharedZA], []>;
+ def SVMLSL_SINGLE_VG2x2_F16 : Inst<"svmls[_single]_za32[_{d}]_vg2x2", "vm2d", "bh", MergeNone, "aarch64_sme_fmlsl_single_vg2x2", [IsStreaming, IsSharedZA], []>;
+ def SVMLSL_SINGLE_VG2x4_F16 : Inst<"svmls[_single]_za32[_{d}]_vg2x4", "vm4d", "bh", MergeNone, "aarch64_sme_fmlsl_single_vg2x4", [IsStreaming, IsSharedZA], []>;
+ def SVMLSL_SINGLE_VG2x1_S16 : Inst<"svmls_za32[_{d}]_vg2x1", "vmdd", "s", MergeNone, "aarch64_sme_smlsl_single_vg2x1", [IsStreaming, IsSharedZA], []>;
+ def SVMLSL_SINGLE_VG2x2_S16 : Inst<"svmls[_single]_za32[_{d}]_vg2x2", "vm2d", "s", MergeNone, "aarch64_sme_smlsl_single_vg2x2", [IsStreaming, IsSharedZA], []>;
+ def SVMLSL_SINGLE_VG2x4_S16 : Inst<"svmls[_single]_za32[_{d}]_vg2x4", "vm4d", "s", MergeNone, "aarch64_sme_smlsl_single_vg2x4", [IsStreaming, IsSharedZA], []>;
+ def SVMLSL_SINGLE_VG2x1_U16 : Inst<"svmls_za32[_{d}]_vg2x1", "vmdd", "Us", MergeNone, "aarch64_sme_umlsl_single_vg2x1", [IsStreaming, IsSharedZA], []>;
+ def SVMLSL_SINGLE_VG2x2_U16 : Inst<"svmls[_single]_za32[_{d}]_vg2x2", "vm2d", "Us", MergeNone, "aarch64_sme_umlsl_single_vg2x2", [IsStreaming, IsSharedZA], []>;
+ def SVMLSL_SINGLE_VG2x4_U16 : Inst<"svmls[_single]_za32[_{d}]_vg2x4", "vm4d", "Us", MergeNone, "aarch64_sme_umlsl_single_vg2x4", [IsStreaming, IsSharedZA], []>;
+
+ def SVMLSL_SINGLE_VG4x1_S8 : Inst<"svmls_za32[_{d}]_vg4x1", "vmdd", "c", MergeNone, "aarch64_sme_smls_za32_single_vg4x1", [IsStreaming, IsSharedZA], []>;
+ def SVMLSL_SINGLE_VG4x1_U8 : Inst<"svmls_za32[_{d}]_vg4x1", "vmdd", "Uc", MergeNone, "aarch64_sme_umls_za32_single_vg4x1", [IsStreaming, IsSharedZA], []>;
+ def SVMLSL_SINGLE_VG4x2_S8 : Inst<"svmls[_single]_za32[_{d}]_vg4x2", "vm2d", "c", MergeNone, "aarch64_sme_smls_za32_single_vg4x2", [IsStreaming, IsSharedZA], []>;
+ def SVMLSL_SINGLE_VG4x2_U8 : Inst<"svmls[_single]_za32[_{d}]_vg4x2", "vm2d", "Uc", MergeNone, "aarch64_sme_umls_za32_single_vg4x2", [IsStreaming, IsSharedZA], []>;
+ def SVMLSL_SINGLE_VG4x4_S8 : Inst<"svmls[_single]_za32[_{d}]_vg4x4", "vm4d", "c", MergeNone, "aarch64_sme_smls_za32_single_vg4x4", [IsStreaming, IsSharedZA], []>;
+ def SVMLSL_SINGLE_VG4x4_U8 : Inst<"svmls[_single]_za32[_{d}]_vg4x4", "vm4d", "Uc", MergeNone, "aarch64_sme_umls_za32_single_vg4x4", [IsStreaming, IsSharedZA], []>;
+
+ let TargetGuard = "sme-i16i64" in {
+ def SVMLSL_SINGLE_VG4x1_S16 : Inst<"svmls_za64[_{d}]_vg4x1", "vmdd", "s", MergeNone, "aarch64_sme_smls_za64_single_vg4x1", [IsStreaming, IsSharedZA], []>;
+ def SVMLSL_SINGLE_VG4x1_U16 : Inst<"svmls_za64[_{d}]_vg4x1", "vmdd", "Us", MergeNone, "aarch64_sme_umls_za64_single_vg4x1", [IsStreaming, IsSharedZA], []>;
+ def SVMLSL_SINGLE_VG4x2_S16 : Inst<"svmls[_single]_za64[_{d}]_vg4x2", "vm2d", "s", MergeNone, "aarch64_sme_smls_za64_single_vg4x2", [IsStreaming, IsSharedZA], []>;
+ def SVMLSL_SINGLE_VG4x2_U16 : Inst<"svmls[_single]_za64[_{d}]_vg4x2", "vm2d", "Us", MergeNone, "aarch64_sme_umls_za64_single_vg4x2", [IsStreaming, IsSharedZA], []>;
+ def SVMLSL_SINGLE_VG4x4_S16 : Inst<"svmls[_single]_za64[_{d}]_vg4x4", "vm4d", "s", MergeNone, "aarch64_sme_smls_za64_single_vg4x4", [IsStreaming, IsSharedZA], []>;
+ def SVMLSL_SINGLE_VG4x4_U16 : Inst<"svmls[_single]_za64[_{d}]_vg4x4", "vm4d", "Us", MergeNone, "aarch64_sme_umls_za64_single_vg4x4", [IsStreaming, IsSharedZA], []>;
+ }
+
+ // INDEXED MLAL
+ def SVMLAL_LANE_VG2x1_F16 : Inst<"svmla_lane_za32[_{d}]_vg2x1", "vmddi", "bh", MergeNone, "aarch64_sme_fmlal_lane_vg2x1", [IsStreaming, IsSharedZA], [ImmCheck<3, ImmCheck0_7>]>;
+ def SVMLAL_LANE_VG2x2_F16 : Inst<"svmla_lane_za32[_{d}]_vg2x2", "vm2di", "bh", MergeNone, "aarch64_sme_fmlal_lane_vg2x2", [IsStreaming, IsSharedZA], [ImmCheck<3, ImmCheck0_7>]>;
+ def SVMLAL_LANE_VG2x4_F16 : Inst<"svmla_lane_za32[_{d}]_vg2x4", "vm4di", "bh", MergeNone, "aarch64_sme_fmlal_lane_vg2x4", [IsStreaming, IsSharedZA], [ImmCheck<3, ImmCheck0_7>]>;
+ def SVMLAL_LANE_VG2x1_S16 : Inst<"svmla_lane_za32[_{d}]_vg2x1", "vmddi", "s", MergeNone, "aarch64_sme_smlal_lane_vg2x1", [IsStreaming, IsSharedZA], [ImmCheck<3, ImmCheck0_7>]>;
+ def SVMLAL_LANE_VG2x2_S16 : Inst<"svmla_lane_za32[_{d}]_vg2x2", "vm2di", "s", MergeNone, "aarch64_sme_smlal_lane_vg2x2", [IsStreaming, IsSharedZA], [ImmCheck<3, ImmCheck0_7>]>;
+ def SVMLAL_LANE_VG2x4_S16 : Inst<"svmla_lane_za32[_{d}]_vg2x4", "vm4di", "s", MergeNone, "aarch64_sme_smlal_lane_vg2x4", [IsStreaming, IsSharedZA], [ImmCheck<3, ImmCheck0_7>]>;
+ def SVMLAL_LANE_VG2x1_U16 : Inst<"svmla_lane_za32[_{d}]_vg2x1", "vmddi", "Us", MergeNone, "aarch64_sme_umlal_lane_vg2x1", [IsStreaming, IsSharedZA], [ImmCheck<3, ImmCheck0_7>]>;
+ def SVMLAL_LANE_VG2x2_U16 : Inst<"svmla_lane_za32[_{d}]_vg2x2", "vm2di", "Us", MergeNone, "aarch64_sme_umlal_lane_vg2x2", [IsStreaming, IsSharedZA], [ImmCheck<3, ImmCheck0_7>]>;
+ def SVMLAL_LANE_VG2x4_U16 : Inst<"svmla_lane_za32[_{d}]_vg2x4", "vm4di", "Us", MergeNone, "aarch64_sme_umlal_lane_vg2x4", [IsStreaming, IsSharedZA], [ImmCheck<3, ImmCheck0_7>]>;
+
+ def SVMLAL_LANE_VG4x1_S8 : Inst<"svmla_lane_za32[_{d}]_vg4x1", "vmddi", "c", MergeNone, "aarch64_sme_smla_za32_lane_vg4x1", [IsStreaming, IsSharedZA], [ImmCheck<3, ImmCheck0_15>]>;
+ def SVMLAL_LANE_VG4x1_U8 : Inst<"svmla_lane_za32[_{d}]_vg4x1", "vmddi", "Uc", MergeNone, "aarch64_sme_umla_za32_lane_vg4x1", [IsStreaming, IsSharedZA], [ImmCheck<3, ImmCheck0_15>]>;
+ def SVMLAL_LANE_VG4x2_S8 : Inst<"svmla_lane_za32[_{d}]_vg4x2", "vm2di", "c", MergeNone, "aarch64_sme_smla_za32_lane_vg4x2", [IsStreaming, IsSharedZA], [ImmCheck<3, ImmCheck0_15>]>;
+ def SVMLAL_LANE_VG4x2_U8 : Inst<"svmla_lane_za32[_{d}]_vg4x2", "vm2di", "Uc", MergeNone, "aarch64_sme_umla_za32_lane_vg4x2", [IsStreaming, IsSharedZA], [ImmCheck<3, ImmCheck0_15>]>;
+ def SVMLAL_LANE_VG4x4_S8 : Inst<"svmla_lane_za32[_{d}]_vg4x4", "vm4di", "c", MergeNone, "aarch64_sme_smla_za32_lane_vg4x4", [IsStreaming, IsSharedZA], [ImmCheck<3, ImmCheck0_15>]>;
+ def SVMLAL_LANE_VG4x4_U8 : Inst<"svmla_lane_za32[_{d}]_vg4x4", "vm4di", "Uc", MergeNone, "aarch64_sme_umla_za32_lane_vg4x4", [IsStreaming, IsSharedZA], [ImmCheck<3, ImmCheck0_15>]>;
+
+ let TargetGuard = "sme-i16i64" in {
+ def SVMLAL_LANE_VG4x1_S16 : Inst<"svmla_lane_za64[_{d}]_vg4x1", "vmddi", "s", MergeNone, "aarch64_sme_smla_za64_lane_vg4x1", [IsStreaming, IsSharedZA], [ImmCheck<3, ImmCheck0_7>]>;
+ def SVMLAL_LANE_VG4x1_U16 : Inst<"svmla_lane_za64[_{d}]_vg4x1", "vmddi", "Us", MergeNone, "aarch64_sme_umla_za64_lane_vg4x1", [IsStreaming, IsSharedZA], [ImmCheck<3, ImmCheck0_7>]>;
+ def SVMLAL_LANE_VG4x2_S16 : Inst<"svmla_lane_za64[_{d}]_vg4x2", "vm2di", "s", MergeNone, "aarch64_sme_smla_za64_lane_vg4x2", [IsStreaming, IsSharedZA], [ImmCheck<3, ImmCheck0_7>]>;
+ def SVMLAL_LANE_VG4x2_U16 : Inst<"svmla_lane_za64[_{d}]_vg4x2", "vm2di", "Us", MergeNone, "aarch64_sme_umla_za64_lane_vg4x2", [IsStreaming, IsSharedZA], [ImmCheck<3, ImmCheck0_7>]>;
+ def SVMLAL_LANE_VG4x4_S16 : Inst<"svmla_lane_za64[_{d}]_vg4x4", "vm4di", "s", MergeNone, "aarch64_sme_smla_za64_lane_vg4x4", [IsStreaming, IsSharedZA], [ImmCheck<3, ImmCheck0_7>]>;
+ def SVMLAL_LANE_VG4x4_U16 : Inst<"svmla_lane_za64[_{d}]_vg4x4", "vm4di", "Us", MergeNone, "aarch64_sme_umla_za64_lane_vg4x4", [IsStreaming, IsSharedZA], [ImmCheck<3, ImmCheck0_7>]>;
+ }
+
+ // INDEXED MLSL
+ def SVMLSL_LANE_VG2x1_F16 : Inst<"svmls_lane_za32[_{d}]_vg2x1", "vmddi", "bh", MergeNone, "aarch64_sme_fmlsl_lane_vg2x1", [IsStreaming, IsSharedZA], [ImmCheck<3, ImmCheck0_7>]>;
+ def SVMLSL_LANE_VG2x2_F16 : Inst<"svmls_lane_za32[_{d}]_vg2x2", "vm2di", "bh", MergeNone, "aarch64_sme_fmlsl_lane_vg2x2", [IsStreaming, IsSharedZA], [ImmCheck<3, ImmCheck0_7>]>;
+ def SVMLSL_LANE_VG2x4_F16 : Inst<"svmls_lane_za32[_{d}]_vg2x4", "vm4di", "bh", MergeNone, "aarch64_sme_fmlsl_lane_vg2x4", [IsStreaming, IsSharedZA], [ImmCheck<3, ImmCheck0_7>]>;
+ def SVMLSL_LANE_VG2x1_S16 : Inst<"svmls_lane_za32[_{d}]_vg2x1", "vmddi", "s", MergeNone, "aarch64_sme_smlsl_lane_vg2x1", [IsStreaming, IsSharedZA], [ImmCheck<3, ImmCheck0_7>]>;
+ def SVMLSL_LANE_VG2x2_S16 : Inst<"svmls_lane_za32[_{d}]_vg2x2", "vm2di", "s", MergeNone, "aarch64_sme_smlsl_lane_vg2x2", [IsStreaming, IsSharedZA], [ImmCheck<3, ImmCheck0_7>]>;
+ def SVMLSL_LANE_VG2x4_S16 : Inst<"svmls_lane_za32[_{d}]_vg2x4", "vm4di", "s", MergeNone, "aarch64_sme_smlsl_lane_vg2x4", [IsStreaming, IsSharedZA], [ImmCheck<3, ImmCheck0_7>]>;
+ def SVMLSL_LANE_VG2x1_U16 : Inst<"svmls_lane_za32[_{d}]_vg2x1", "vmddi", "Us", MergeNone, "aarch64_sme_umlsl_lane_vg2x1", [IsStreaming, IsSharedZA], [ImmCheck<3, ImmCheck0_7>]>;
+ def SVMLSL_LANE_VG2x2_U16 : Inst<"svmls_lane_za32[_{d}]_vg2x2", "vm2di", "Us", MergeNone, "aarch64_sme_umlsl_lane_vg2x2", [IsStreaming, IsSharedZA], [ImmCheck<3, ImmCheck0_7>]>;
+ def SVMLSL_LANE_VG2x4_U16 : Inst<"svmls_lane_za32[_{d}]_vg2x4", "vm4di", "Us", MergeNone, "aarch64_sme_umlsl_lane_vg2x4", [IsStreaming, IsSharedZA], [ImmCheck<3, ImmCheck0_7>]>;
+
+ def SVMLSL_LANE_VG4x1_S8 : Inst<"svmls_lane_za32[_{d}]_vg4x1", "vmddi", "c", MergeNone, "aarch64_sme_smls_za32_lane_vg4x1", [IsStreaming, IsSharedZA], [ImmCheck<3, ImmCheck0_15>]>;
+ def SVMLSL_LANE_VG4x1_U8 : Inst<"svmls_lane_za32[_{d}]_vg4x1", "vmddi", "Uc", MergeNone, "aarch64_sme_umls_za32_lane_vg4x1", [IsStreaming, IsSharedZA], [ImmCheck<3, ImmCheck0_15>]>;
+ def SVMLSL_LANE_VG4x2_S8 : Inst<"svmls_lane_za32[_{d}]_vg4x2", "vm2di", "c", MergeNone, "aarch64_sme_smls_za32_lane_vg4x2", [IsStreaming, IsSharedZA], [ImmCheck<3, ImmCheck0_15>]>;
+ def SVMLSL_LANE_VG4x2_U8 : Inst<"svmls_lane_za32[_{d}]_vg4x2", "vm2di", "Uc", MergeNone, "aarch64_sme_umls_za32_lane_vg4x2", [IsStreaming, IsSharedZA], [ImmCheck<3, ImmCheck0_15>]>;
+ def SVMLSL_LANE_VG4x4_S8 : Inst<"svmls_lane_za32[_{d}]_vg4x4", "vm4di", "c", MergeNone, "aarch64_sme_smls_za32_lane_vg4x4", [IsStreaming, IsSharedZA], [ImmCheck<3, ImmCheck0_15>]>;
+ def SVMLSL_LANE_VG4x4_U8 : Inst<"svmls_lane_za32[_{d}]_vg4x4", "vm4di", "Uc", MergeNone, "aarch64_sme_umls_za32_lane_vg4x4", [IsStreaming, IsSharedZA], [ImmCheck<3, ImmCheck0_15>]>;
+
+ let TargetGuard = "sme-i16i64" in {
+ def SVMLSL_LANE_VG4x1_S16 : Inst<"svmls_lane_za64[_{d}]_vg4x1", "vmddi", "s", MergeNone, "aarch64_sme_smls_za64_lane_vg4x1", [IsStreaming, IsSharedZA], [ImmCheck<3, ImmCheck0_7>]>;
+ def SVMLSL_LANE_VG4x1_U16 : Inst<"svmls_lane_za64[_{d}]_vg4x1", "vmddi", "Us", MergeNone, "aarch64_sme_umls_za64_lane_vg4x1", [IsStreaming, IsSharedZA], [ImmCheck<3, ImmCheck0_7>]>;
+ def SVMLSL_LANE_VG4x2_S16 : Inst<"svmls_lane_za64[_{d}]_vg4x2", "vm2di", "s", MergeNone, "aarch64_sme_smls_za64_lane_vg4x2", [IsStreaming, IsSharedZA], [ImmCheck<3, ImmCheck0_7>]>;
+ def SVMLSL_LANE_VG4x2_U16 : Inst<"svmls_lane_za64[_{d}]_vg4x2", "vm2di", "Us", MergeNone, "aarch64_sme_umls_za64_lane_vg4x2", [IsStreaming, IsSharedZA], [ImmCheck<3, ImmCheck0_7>]>;
+ def SVMLSL_LANE_VG4x4_S16 : Inst<"svmls_lane_za64[_{d}]_vg4x4", "vm4di", "s", MergeNone, "aarch64_sme_smls_za64_lane_vg4x4", [IsStreaming, IsSharedZA], [ImmCheck<3, ImmCheck0_7>]>;
+ def SVMLSL_LANE_VG4x4_U16 : Inst<"svmls_lane_za64[_{d}]_vg4x4", "vm4di", "Us", MergeNone, "aarch64_sme_umls_za64_lane_vg4x4", [IsStreaming, IsSharedZA], [ImmCheck<3, ImmCheck0_7>]>;
+ }
+
+  // SINGLE SUMLALL
+  // The single (vg4x1) sumla builtin maps to usmla, with the zn & zm operands
+  // swapped in CGBuiltin.cpp; the vg4x2/x4 single forms use dedicated sumla
+  // intrinsics.
+ def SVSUMLALL_SINGLE_VG4x1 : Inst<"svsumla_za32[_{d}]_vg4x1", "vmdu", "c", MergeNone, "aarch64_sme_usmla_za32_single_vg4x1", [IsStreaming, IsSharedZA], []>;
+
+ def SVSUMLALL_SINGLE_VG4x2 : Inst<"svsumla[_single]_za32[_{d}]_vg4x2", "vm2.du", "c", MergeNone, "aarch64_sme_sumla_za32_single_vg4x2", [IsStreaming, IsSharedZA], []>;
+ def SVSUMLALL_SINGLE_VG4x4 : Inst<"svsumla[_single]_za32[_{d}]_vg4x4", "vm4.du", "c", MergeNone, "aarch64_sme_sumla_za32_single_vg4x4", [IsStreaming, IsSharedZA], []>;
+
+ // Multi-multi sumla builtins are mapped to usmla, with zn & zm operands swapped
+ def SVSUMLALL_MULTI_VG4x2 : Inst<"svsumla_za32[_{d}]_vg4x2", "vm2.d2.u", "c", MergeNone, "aarch64_sme_usmla_za32_vg4x2", [IsStreaming, IsSharedZA], []>;
+ def SVSUMLALL_MULTI_VG4x4 : Inst<"svsumla_za32[_{d}]_vg4x4", "vm4.d4.u", "c", MergeNone, "aarch64_sme_usmla_za32_vg4x4", [IsStreaming, IsSharedZA], []>;
+
+ // INDEXED SUMLALL
+ def SVSUMLALL_LANE_VG4x1 : Inst<"svsumla_lane_za32[_{d}]_vg4x1", "vmdui", "c", MergeNone, "aarch64_sme_sumla_za32_lane_vg4x1", [IsStreaming, IsSharedZA], [ImmCheck<3, ImmCheck0_15>]>;
+ def SVSUMLALL_LANE_VG4x2 : Inst<"svsumla_lane_za32[_{d}]_vg4x2", "vm2ui", "c", MergeNone, "aarch64_sme_sumla_za32_lane_vg4x2", [IsStreaming, IsSharedZA], [ImmCheck<3, ImmCheck0_15>]>;
+ def SVSUMLALL_LANE_VG4x4 : Inst<"svsumla_lane_za32[_{d}]_vg4x4", "vm4ui", "c", MergeNone, "aarch64_sme_sumla_za32_lane_vg4x4", [IsStreaming, IsSharedZA], [ImmCheck<3, ImmCheck0_15>]>;
+
+ // SINGLE USMLALL
+ def SVUSMLALL_SINGLE_VG4x1 : Inst<"svusmla_za32[_{d}]_vg4x1", "vmdx", "Uc", MergeNone, "aarch64_sme_usmla_za32_single_vg4x1", [IsStreaming, IsSharedZA], []>;
+ def SVUSMLALL_SINGLE_VG4x2 : Inst<"svusmla[_single]_za32[_{d}]_vg4x2", "vm2.dx", "Uc", MergeNone, "aarch64_sme_usmla_za32_single_vg4x2", [IsStreaming, IsSharedZA], []>;
+ def SVUSMLALL_SINGLE_VG4x4 : Inst<"svusmla[_single]_za32[_{d}]_vg4x4", "vm4.dx", "Uc", MergeNone, "aarch64_sme_usmla_za32_single_vg4x4", [IsStreaming, IsSharedZA], []>;
+
+ // MULTI USMLALL
+ def SVUSMLALL_MULTI_VG4x2 : Inst<"svusmla_za32[_{d}]_vg4x2", "vm2.d2.x", "Uc", MergeNone, "aarch64_sme_usmla_za32_vg4x2", [IsStreaming, IsSharedZA], []>;
+ def SVUSMLALL_MULTI_VG4x4 : Inst<"svusmla_za32[_{d}]_vg4x4", "vm4.d4.x", "Uc", MergeNone, "aarch64_sme_usmla_za32_vg4x4", [IsStreaming, IsSharedZA], []>;
+
+ // INDEXED USMLALL
+ def SVUSMLALL_LANE_VG4x1 : Inst<"svusmla_lane_za32[_{d}]_vg4x1", "vmdxi", "Uc", MergeNone, "aarch64_sme_usmla_za32_lane_vg4x1", [IsStreaming, IsSharedZA], [ImmCheck<3, ImmCheck0_15>]>;
+ def SVUSMLALL_LANE_VG4x2 : Inst<"svusmla_lane_za32[_{d}]_vg4x2", "vm2xi", "Uc", MergeNone, "aarch64_sme_usmla_za32_lane_vg4x2", [IsStreaming, IsSharedZA], [ImmCheck<3, ImmCheck0_15>]>;
+ def SVUSMLALL_LANE_VG4x4 : Inst<"svusmla_lane_za32[_{d}]_vg4x4", "vm4xi", "Uc", MergeNone, "aarch64_sme_usmla_za32_lane_vg4x4", [IsStreaming, IsSharedZA], [ImmCheck<3, ImmCheck0_15>]>;
+}
+
//
// Spill and fill of ZT0
//
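
A usage sketch for the widening forms added above (illustrative only,
following the streaming/shared-ZA attributes used by the tests below; not
part of the patch):

  void mlal_example(uint32_t slice, svint16x2_t zn, svint16_t zm)
      __arm_streaming __arm_shared_za {
    svmla_single_za32_s16_vg2x2(slice, zn, zm);  // multi-vector by single SMLAL
    svmla_lane_za32_s16_vg2x2(slice, zn, zm, 7); // indexed form; lane is 0-7
  }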
diff --git a/clang/lib/CodeGen/CGBuiltin.cpp b/clang/lib/CodeGen/CGBuiltin.cpp
index a29304c81928cc..b009e9f0dac83b 100644
--- a/clang/lib/CodeGen/CGBuiltin.cpp
+++ b/clang/lib/CodeGen/CGBuiltin.cpp
@@ -10318,6 +10318,28 @@ Value *CodeGenFunction::EmitAArch64SVEBuiltinExpr(unsigned BuiltinID,
return nullptr;
}
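+
+// The svsumla builtins handled below multiply signed (zn) by unsigned (zm)
+// operands but are emitted via the commutative usmla intrinsics, so the zn
+// and zm multi-vector operand groups must be swapped to match the usmla
+// operand order.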
+static void swapCommutativeSMEOperands(unsigned BuiltinID,
+                                       SmallVectorImpl<Value *> &Ops) {
+  unsigned MultiVec;
+  switch (BuiltinID) {
+  default:
+    return;
+  case SME::BI__builtin_sme_svsumla_za32_s8_vg4x1:
+    MultiVec = 1;
+    break;
+  case SME::BI__builtin_sme_svsumla_za32_s8_vg4x2:
+    MultiVec = 2;
+    break;
+  case SME::BI__builtin_sme_svsumla_za32_s8_vg4x4:
+    MultiVec = 4;
+    break;
+  }
+
+  if (MultiVec > 0)
+    for (unsigned I = 0; I < MultiVec; ++I)
+      std::swap(Ops[I + 1], Ops[I + 1 + MultiVec]);
+}
+
Value *CodeGenFunction::EmitAArch64SMEBuiltinExpr(unsigned BuiltinID,
const CallExpr *E) {
auto *Builtin = findARMVectorIntrinsicInMap(AArch64SMEIntrinsicMap, BuiltinID,
@@ -10340,6 +10362,9 @@ Value *CodeGenFunction::EmitAArch64SMEBuiltinExpr(unsigned BuiltinID,
BuiltinID == SME::BI__builtin_sme_svstr_za)
return EmitSMELdrStr(TypeFlags, Ops, Builtin->LLVMIntrinsic);
+  // Handle builtins which require their multi-vector operands to be swapped.
+  swapCommutativeSMEOperands(BuiltinID, Ops);
+
// Should not happen!
if (Builtin->LLVMIntrinsic == 0)
return nullptr;
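
With MultiVec == 2, for example, the swap above rewrites
Ops = {slice, zn0, zn1, zm0, zm1} into {slice, zm0, zm1, zn0, zn1} before
the intrinsic call is built. A standalone sketch of the same reindexing
(hypothetical helper, for illustration only; not part of the patch):

  #include <utility>
  #include <vector>

  // Ops[0] is the ZA slice index; Ops[1..N] hold the zn group and
  // Ops[N+1..2N] the zm group, mirroring swapCommutativeSMEOperands.
  static void swapOperandGroups(std::vector<unsigned> &Ops, unsigned N) {
    for (unsigned I = 0; I < N; ++I)
      std::swap(Ops[I + 1], Ops[I + 1 + N]);
  }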
diff --git a/clang/test/CodeGen/aarch64-sme2-intrinsics/acle_sme2_mlal.c b/clang/test/CodeGen/aarch64-sme2-intrinsics/acle_sme2_mlal.c
new file mode 100644
index 00000000000000..433d92a20fb841
--- /dev/null
+++ b/clang/test/CodeGen/aarch64-sme2-intrinsics/acle_sme2_mlal.c
@@ -0,0 +1,760 @@
+// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py
+// REQUIRES: aarch64-registered-target
+
+// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme2 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s
+// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme2 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK
+// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sme2 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s
+// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sme2 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK
+// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme2 -S -disable-O0-optnone -Werror -Wall -o /dev/null %s
+#include <arm_sme_draft_spec_subject_to_change.h>
+
+#ifdef SVE_OVERLOADED_FORMS
+// A simple used,unused... macro, long enough to represent any SVE builtin.
+#define SVE_ACLE_FUNC(A1,A2_UNUSED,A3,A4_UNUSED,A5) A1##A3##A5
+#else
+#define SVE_ACLE_FUNC(A1,A2,A3,A4,A5) A1##A2##A3##A4##A5
+#endif
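+
+// e.g. SVE_ACLE_FUNC(svmla_za32,_f16,_vg2x2,,) expands to the overloaded
+// name svmla_za32_vg2x2 under SVE_OVERLOADED_FORMS, and to the full name
+// svmla_za32_f16_vg2x2 otherwise.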
+
+//
+// Multi, multi
+// CHECK-LABEL: @test_svmla2_f16(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[ADD:%.*]] = add i32 [[SLICE_BASE:%.*]], 6
+// CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 8 x half> @llvm.vector.extract.nxv8f16.nxv16f16(<vscale x 16 x half> [[ZN:%.*]], i64 0)
+// CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 8 x half> @llvm.vector.extract.nxv8f16.nxv16f16(<vscale x 16 x half> [[ZN]], i64 8)
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <vscale x 8 x half> @llvm.vector.extract.nxv8f16.nxv16f16(<vscale x 16 x half> [[ZM:%.*]], i64 0)
+// CHECK-NEXT: [[TMP3:%.*]] = tail call <vscale x 8 x half> @llvm.vector.extract.nxv8f16.nxv16f16(<vscale x 16 x half> [[ZM]], i64 8)
+// CHECK-NEXT: tail call void @llvm.aarch64.sme.fmlal.vg2x2.nxv8f16(i32 [[ADD]], <vscale x 8 x half> [[TMP0]], <vscale x 8 x half> [[TMP1]], <vscale x 8 x half> [[TMP2]], <vscale x 8 x half> [[TMP3]])
+// CHECK-NEXT: ret void
+//
+// CPP-CHECK-LABEL: @_Z15test_svmla2_f16j13svfloat16x2_tS_(
+// CPP-CHECK-NEXT: entry:
+// CPP-CHECK-NEXT: [[ADD:%.*]] = add i32 [[SLICE_BASE:%.*]], 6
+// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 8 x half> @llvm.vector.extract.nxv8f16.nxv16f16(<vscale x 16 x half> [[ZN:%.*]], i64 0)
+// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 8 x half> @llvm.vector.extract.nxv8f16.nxv16f16(<vscale x 16 x half> [[ZN]], i64 8)
+// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call <vscale x 8 x half> @llvm.vector.extract.nxv8f16.nxv16f16(<vscale x 16 x half> [[ZM:%.*]], i64 0)
+// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call <vscale x 8 x half> @llvm.vector.extract.nxv8f16.nxv16f16(<vscale x 16 x half> [[ZM]], i64 8)
+// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.fmlal.vg2x2.nxv8f16(i32 [[ADD]], <vscale x 8 x half> [[TMP0]], <vscale x 8 x half> [[TMP1]], <vscale x 8 x half> [[TMP2]], <vscale x 8 x half> [[TMP3]])
+// CPP-CHECK-NEXT: ret void
+//
+void test_svmla2_f16(uint32_t slice_base, svfloat16x2_t zn, svfloat16x2_t zm) __arm_streaming __arm_shared_za
+{
+ SVE_ACLE_FUNC(svmla_za32,_f16,_vg2x2,,)(slice_base + 6, zn, zm);
+}
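+
+// Note: for tuple arguments the autogenerated checks verify that each
+// two-vector tuple is split with llvm.vector.extract before the single
+// @llvm.aarch64.sme.fmlal.vg2x2 call; the same pattern repeats below.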
+
+// CHECK-LABEL: @test_svmla2_bf16(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[ADD:%.*]] = add i32 [[SLICE_BASE:%.*]], 6
+// CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 8 x bfloat> @llvm.vector.extract.nxv8bf16.nxv16bf16(<vscale x 16 x bfloat> [[ZN:%.*]], i64 0)
+// CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 8 x bfloat> @llvm.vector.extract.nxv8bf16.nxv16bf16(<vscale x 16 x bfloat> [[ZN]], i64 8)
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <vscale x 8 x bfloat> @llvm.vector.extract.nxv8bf16.nxv16bf16(<vscale x 16 x bfloat> [[ZM:%.*]], i64 0)
+// CHECK-NEXT: [[TMP3:%.*]] = tail call <vscale x 8 x bfloat> @llvm.vector.extract.nxv8bf16.nxv16bf16(<vscale x 16 x bfloat> [[ZM]], i64 8)
+// CHECK-NEXT: tail call void @llvm.aarch64.sme.fmlal.vg2x2.nxv8bf16(i32 [[ADD]], <vscale x 8 x bfloat> [[TMP0]], <vscale x 8 x bfloat> [[TMP1]], <vscale x 8 x bfloat> [[TMP2]], <vscale x 8 x bfloat> [[TMP3]])
+// CHECK-NEXT: ret void
+//
+// CPP-CHECK-LABEL: @_Z16test_svmla2_bf16j14svbfloat16x2_tS_(
+// CPP-CHECK-NEXT: entry:
+// CPP-CHECK-NEXT: [[ADD:%.*]] = add i32 [[SLICE_BASE:%.*]], 6
+// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 8 x bfloat> @llvm.vector.extract.nxv8bf16.nxv16bf16(<vscale x 16 x bfloat> [[ZN:%.*]], i64 0)
+// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 8 x bfloat> @llvm.vector.extract.nxv8bf16.nxv16bf16(<vscale x 16 x bfloat> [[ZN]], i64 8)
+// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call <vscale x 8 x bfloat> @llvm.vector.extract.nxv8bf16.nxv16bf16(<vscale x 16 x bfloat> [[ZM:%.*]], i64 0)
+// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call <vscale x 8 x bfloat> @llvm.vector.extract.nxv8bf16.nxv16bf16(<vscale x 16 x bfloat> [[ZM]], i64 8)
+// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.fmlal.vg2x2.nxv8bf16(i32 [[ADD]], <vscale x 8 x bfloat> [[TMP0]], <vscale x 8 x bfloat> [[TMP1]], <vscale x 8 x bfloat> [[TMP2]], <vscale x 8 x bfloat> [[TMP3]])
+// CPP-CHECK-NEXT: ret void
+//
+void test_svmla2_bf16(uint32_t slice_base, svbfloat16x2_t zn, svbfloat16x2_t zm) __arm_streaming __arm_shared_za
+{
+ SVE_ACLE_FUNC(svmla_za32,_bf16,_vg2x2,,)(slice_base + 6, zn, zm);
+}
+
+// CHECK-LABEL: @test_svmla2_u16(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[ADD:%.*]] = add i32 [[SLICE_BASE:%.*]], 6
+// CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv16i16(<vscale x 16 x i16> [[ZN:%.*]], i64 0)
+// CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv16i16(<vscale x 16 x i16> [[ZN]], i64 8)
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv16i16(<vscale x 16 x i16> [[ZM:%.*]], i64 0)
+// CHECK-NEXT: [[TMP3:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv16i16(<vscale x 16 x i16> [[ZM]], i64 8)
+// CHECK-NEXT: tail call void @llvm.aarch64.sme.umlal.vg2x2.nxv8i16(i32 [[ADD]], <vscale x 8 x i16> [[TMP0]], <vscale x 8 x i16> [[TMP1]], <vscale x 8 x i16> [[TMP2]], <vscale x 8 x i16> [[TMP3]])
+// CHECK-NEXT: ret void
+//
+// CPP-CHECK-LABEL: @_Z15test_svmla2_u16j12svuint16x2_tS_(
+// CPP-CHECK-NEXT: entry:
+// CPP-CHECK-NEXT: [[ADD:%.*]] = add i32 [[SLICE_BASE:%.*]], 6
+// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv16i16(<vscale x 16 x i16> [[ZN:%.*]], i64 0)
+// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv16i16(<vscale x 16 x i16> [[ZN]], i64 8)
+// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv16i16(<vscale x 16 x i16> [[ZM:%.*]], i64 0)
+// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv16i16(<vscale x 16 x i16> [[ZM]], i64 8)
+// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.umlal.vg2x2.nxv8i16(i32 [[ADD]], <vscale x 8 x i16> [[TMP0]], <vscale x 8 x i16> [[TMP1]], <vscale x 8 x i16> [[TMP2]], <vscale x 8 x i16> [[TMP3]])
+// CPP-CHECK-NEXT: ret void
+//
+void test_svmla2_u16(uint32_t slice_base, svuint16x2_t zn, svuint16x2_t zm) __arm_streaming __arm_shared_za
+{
+ SVE_ACLE_FUNC(svmla_za32,_u16,_vg2x2,,)(slice_base + 6, zn, zm);
+}
+
+// CHECK-LABEL: @test_svmla2_s16(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[ADD:%.*]] = add i32 [[SLICE_BASE:%.*]], 6
+// CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv16i16(<vscale x 16 x i16> [[ZN:%.*]], i64 0)
+// CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv16i16(<vscale x 16 x i16> [[ZN]], i64 8)
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv16i16(<vscale x 16 x i16> [[ZM:%.*]], i64 0)
+// CHECK-NEXT: [[TMP3:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv16i16(<vscale x 16 x i16> [[ZM]], i64 8)
+// CHECK-NEXT: tail call void @llvm.aarch64.sme.smlal.vg2x2.nxv8i16(i32 [[ADD]], <vscale x 8 x i16> [[TMP0]], <vscale x 8 x i16> [[TMP1]], <vscale x 8 x i16> [[TMP2]], <vscale x 8 x i16> [[TMP3]])
+// CHECK-NEXT: ret void
+//
+// CPP-CHECK-LABEL: @_Z15test_svmla2_s16j11svint16x2_tS_(
+// CPP-CHECK-NEXT: entry:
+// CPP-CHECK-NEXT: [[ADD:%.*]] = add i32 [[SLICE_BASE:%.*]], 6
+// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv16i16(<vscale x 16 x i16> [[ZN:%.*]], i64 0)
+// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv16i16(<vscale x 16 x i16> [[ZN]], i64 8)
+// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv16i16(<vscale x 16 x i16> [[ZM:%.*]], i64 0)
+// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv16i16(<vscale x 16 x i16> [[ZM]], i64 8)
+// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.smlal.vg2x2.nxv8i16(i32 [[ADD]], <vscale x 8 x i16> [[TMP0]], <vscale x 8 x i16> [[TMP1]], <vscale x 8 x i16> [[TMP2]], <vscale x 8 x i16> [[TMP3]])
+// CPP-CHECK-NEXT: ret void
+//
+void test_svmla2_s16(uint32_t slice_base, svint16x2_t zn, svint16x2_t zm) __arm_streaming __arm_shared_za
+{
+ SVE_ACLE_FUNC(svmla_za32,_s16,_vg2x2,,)(slice_base + 6, zn, zm);
+}
+
+// CHECK-LABEL: @test_svmla4_f16(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[ADD:%.*]] = add i32 [[SLICE_BASE:%.*]], 6
+// CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 8 x half> @llvm.vector.extract.nxv8f16.nxv32f16(<vscale x 32 x half> [[ZN:%.*]], i64 0)
+// CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 8 x half> @llvm.vector.extract.nxv8f16.nxv32f16(<vscale x 32 x half> [[ZN]], i64 8)
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <vscale x 8 x half> @llvm.vector.extract.nxv8f16.nxv32f16(<vscale x 32 x half> [[ZN]], i64 16)
+// CHECK-NEXT: [[TMP3:%.*]] = tail call <vscale x 8 x half> @llvm.vector.extract.nxv8f16.nxv32f16(<vscale x 32 x half> [[ZN]], i64 24)
+// CHECK-NEXT: [[TMP4:%.*]] = tail call <vscale x 8 x half> @llvm.vector.extract.nxv8f16.nxv32f16(<vscale x 32 x half> [[ZM:%.*]], i64 0)
+// CHECK-NEXT: [[TMP5:%.*]] = tail call <vscale x 8 x half> @llvm.vector.extract.nxv8f16.nxv32f16(<vscale x 32 x half> [[ZM]], i64 8)
+// CHECK-NEXT: [[TMP6:%.*]] = tail call <vscale x 8 x half> @llvm.vector.extract.nxv8f16.nxv32f16(<vscale x 32 x half> [[ZM]], i64 16)
+// CHECK-NEXT: [[TMP7:%.*]] = tail call <vscale x 8 x half> @llvm.vector.extract.nxv8f16.nxv32f16(<vscale x 32 x half> [[ZM]], i64 24)
+// CHECK-NEXT: tail call void @llvm.aarch64.sme.fmlal.vg2x4.nxv8f16(i32 [[ADD]], <vscale x 8 x half> [[TMP0]], <vscale x 8 x half> [[TMP1]], <vscale x 8 x half> [[TMP2]], <vscale x 8 x half> [[TMP3]], <vscale x 8 x half> [[TMP4]], <vscale x 8 x half> [[TMP5]], <vscale x 8 x half> [[TMP6]], <vscale x 8 x half> [[TMP7]])
+// CHECK-NEXT: ret void
+//
+// CPP-CHECK-LABEL: @_Z15test_svmla4_f16j13svfloat16x4_tS_(
+// CPP-CHECK-NEXT: entry:
+// CPP-CHECK-NEXT: [[ADD:%.*]] = add i32 [[SLICE_BASE:%.*]], 6
+// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 8 x half> @llvm.vector.extract.nxv8f16.nxv32f16(<vscale x 32 x half> [[ZN:%.*]], i64 0)
+// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 8 x half> @llvm.vector.extract.nxv8f16.nxv32f16(<vscale x 32 x half> [[ZN]], i64 8)
+// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call <vscale x 8 x half> @llvm.vector.extract.nxv8f16.nxv32f16(<vscale x 32 x half> [[ZN]], i64 16)
+// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call <vscale x 8 x half> @llvm.vector.extract.nxv8f16.nxv32f16(<vscale x 32 x half> [[ZN]], i64 24)
+// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call <vscale x 8 x half> @llvm.vector.extract.nxv8f16.nxv32f16(<vscale x 32 x half> [[ZM:%.*]], i64 0)
+// CPP-CHECK-NEXT: [[TMP5:%.*]] = tail call <vscale x 8 x half> @llvm.vector.extract.nxv8f16.nxv32f16(<vscale x 32 x half> [[ZM]], i64 8)
+// CPP-CHECK-NEXT: [[TMP6:%.*]] = tail call <vscale x 8 x half> @llvm.vector.extract.nxv8f16.nxv32f16(<vscale x 32 x half> [[ZM]], i64 16)
+// CPP-CHECK-NEXT: [[TMP7:%.*]] = tail call <vscale x 8 x half> @llvm.vector.extract.nxv8f16.nxv32f16(<vscale x 32 x half> [[ZM]], i64 24)
+// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.fmlal.vg2x4.nxv8f16(i32 [[ADD]], <vscale x 8 x half> [[TMP0]], <vscale x 8 x half> [[TMP1]], <vscale x 8 x half> [[TMP2]], <vscale x 8 x half> [[TMP3]], <vscale x 8 x half> [[TMP4]], <vscale x 8 x half> [[TMP5]], <vscale x 8 x half> [[TMP6]], <vscale x 8 x half> [[TMP7]])
+// CPP-CHECK-NEXT: ret void
+//
+void test_svmla4_f16(uint32_t slice_base, svfloat16x4_t zn, svfloat16x4_t zm) __arm_streaming __arm_shared_za
+{
+ SVE_ACLE_FUNC(svmla_za32,_f16,_vg2x4,,)(slice_base + 6, zn, zm);
+}
+
+// CHECK-LABEL: @test_svmla4_bf16(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[ADD:%.*]] = add i32 [[SLICE_BASE:%.*]], 6
+// CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 8 x bfloat> @llvm.vector.extract.nxv8bf16.nxv32bf16(<vscale x 32 x bfloat> [[ZN:%.*]], i64 0)
+// CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 8 x bfloat> @llvm.vector.extract.nxv8bf16.nxv32bf16(<vscale x 32 x bfloat> [[ZN]], i64 8)
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <vscale x 8 x bfloat> @llvm.vector.extract.nxv8bf16.nxv32bf16(<vscale x 32 x bfloat> [[ZN]], i64 16)
+// CHECK-NEXT: [[TMP3:%.*]] = tail call <vscale x 8 x bfloat> @llvm.vector.extract.nxv8bf16.nxv32bf16(<vscale x 32 x bfloat> [[ZN]], i64 24)
+// CHECK-NEXT: [[TMP4:%.*]] = tail call <vscale x 8 x bfloat> @llvm.vector.extract.nxv8bf16.nxv32bf16(<vscale x 32 x bfloat> [[ZM:%.*]], i64 0)
+// CHECK-NEXT: [[TMP5:%.*]] = tail call <vscale x 8 x bfloat> @llvm.vector.extract.nxv8bf16.nxv32bf16(<vscale x 32 x bfloat> [[ZM]], i64 8)
+// CHECK-NEXT: [[TMP6:%.*]] = tail call <vscale x 8 x bfloat> @llvm.vector.extract.nxv8bf16.nxv32bf16(<vscale x 32 x bfloat> [[ZM]], i64 16)
+// CHECK-NEXT: [[TMP7:%.*]] = tail call <vscale x 8 x bfloat> @llvm.vector.extract.nxv8bf16.nxv32bf16(<vscale x 32 x bfloat> [[ZM]], i64 24)
+// CHECK-NEXT: tail call void @llvm.aarch64.sme.fmlal.vg2x4.nxv8bf16(i32 [[ADD]], <vscale x 8 x bfloat> [[TMP0]], <vscale x 8 x bfloat> [[TMP1]], <vscale x 8 x bfloat> [[TMP2]], <vscale x 8 x bfloat> [[TMP3]], <vscale x 8 x bfloat> [[TMP4]], <vscale x 8 x bfloat> [[TMP5]], <vscale x 8 x bfloat> [[TMP6]], <vscale x 8 x bfloat> [[TMP7]])
+// CHECK-NEXT: ret void
+//
+// CPP-CHECK-LABEL: @_Z16test_svmla4_bf16j14svbfloat16x4_tS_(
+// CPP-CHECK-NEXT: entry:
+// CPP-CHECK-NEXT: [[ADD:%.*]] = add i32 [[SLICE_BASE:%.*]], 6
+// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 8 x bfloat> @llvm.vector.extract.nxv8bf16.nxv32bf16(<vscale x 32 x bfloat> [[ZN:%.*]], i64 0)
+// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 8 x bfloat> @llvm.vector.extract.nxv8bf16.nxv32bf16(<vscale x 32 x bfloat> [[ZN]], i64 8)
+// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call <vscale x 8 x bfloat> @llvm.vector.extract.nxv8bf16.nxv32bf16(<vscale x 32 x bfloat> [[ZN]], i64 16)
+// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call <vscale x 8 x bfloat> @llvm.vector.extract.nxv8bf16.nxv32bf16(<vscale x 32 x bfloat> [[ZN]], i64 24)
+// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call <vscale x 8 x bfloat> @llvm.vector.extract.nxv8bf16.nxv32bf16(<vscale x 32 x bfloat> [[ZM:%.*]], i64 0)
+// CPP-CHECK-NEXT: [[TMP5:%.*]] = tail call <vscale x 8 x bfloat> @llvm.vector.extract.nxv8bf16.nxv32bf16(<vscale x 32 x bfloat> [[ZM]], i64 8)
+// CPP-CHECK-NEXT: [[TMP6:%.*]] = tail call <vscale x 8 x bfloat> @llvm.vector.extract.nxv8bf16.nxv32bf16(<vscale x 32 x bfloat> [[ZM]], i64 16)
+// CPP-CHECK-NEXT: [[TMP7:%.*]] = tail call <vscale x 8 x bfloat> @llvm.vector.extract.nxv8bf16.nxv32bf16(<vscale x 32 x bfloat> [[ZM]], i64 24)
+// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.fmlal.vg2x4.nxv8bf16(i32 [[ADD]], <vscale x 8 x bfloat> [[TMP0]], <vscale x 8 x bfloat> [[TMP1]], <vscale x 8 x bfloat> [[TMP2]], <vscale x 8 x bfloat> [[TMP3]], <vscale x 8 x bfloat> [[TMP4]], <vscale x 8 x bfloat> [[TMP5]], <vscale x 8 x bfloat> [[TMP6]], <vscale x 8 x bfloat> [[TMP7]])
+// CPP-CHECK-NEXT: ret void
+//
+void test_svmla4_bf16(uint32_t slice_base, svbfloat16x4_t zn, svbfloat16x4_t zm) __arm_streaming __arm_shared_za
+{
+ SVE_ACLE_FUNC(svmla_za32,_bf16,_vg2x4,,)(slice_base + 6, zn, zm);
+}
+
+// CHECK-LABEL: @test_svmla4_u16(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[ADD:%.*]] = add i32 [[SLICE_BASE:%.*]], 6
+// CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv32i16(<vscale x 32 x i16> [[ZN:%.*]], i64 0)
+// CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv32i16(<vscale x 32 x i16> [[ZN]], i64 8)
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv32i16(<vscale x 32 x i16> [[ZN]], i64 16)
+// CHECK-NEXT: [[TMP3:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv32i16(<vscale x 32 x i16> [[ZN]], i64 24)
+// CHECK-NEXT: [[TMP4:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv32i16(<vscale x 32 x i16> [[ZM:%.*]], i64 0)
+// CHECK-NEXT: [[TMP5:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv32i16(<vscale x 32 x i16> [[ZM]], i64 8)
+// CHECK-NEXT: [[TMP6:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv32i16(<vscale x 32 x i16> [[ZM]], i64 16)
+// CHECK-NEXT: [[TMP7:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv32i16(<vscale x 32 x i16> [[ZM]], i64 24)
+// CHECK-NEXT: tail call void @llvm.aarch64.sme.umlal.vg2x4.nxv8i16(i32 [[ADD]], <vscale x 8 x i16> [[TMP0]], <vscale x 8 x i16> [[TMP1]], <vscale x 8 x i16> [[TMP2]], <vscale x 8 x i16> [[TMP3]], <vscale x 8 x i16> [[TMP4]], <vscale x 8 x i16> [[TMP5]], <vscale x 8 x i16> [[TMP6]], <vscale x 8 x i16> [[TMP7]])
+// CHECK-NEXT: ret void
+//
+// CPP-CHECK-LABEL: @_Z15test_svmla4_u16j12svuint16x4_tS_(
+// CPP-CHECK-NEXT: entry:
+// CPP-CHECK-NEXT: [[ADD:%.*]] = add i32 [[SLICE_BASE:%.*]], 6
+// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv32i16(<vscale x 32 x i16> [[ZN:%.*]], i64 0)
+// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv32i16(<vscale x 32 x i16> [[ZN]], i64 8)
+// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv32i16(<vscale x 32 x i16> [[ZN]], i64 16)
+// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv32i16(<vscale x 32 x i16> [[ZN]], i64 24)
+// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv32i16(<vscale x 32 x i16> [[ZM:%.*]], i64 0)
+// CPP-CHECK-NEXT: [[TMP5:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv32i16(<vscale x 32 x i16> [[ZM]], i64 8)
+// CPP-CHECK-NEXT: [[TMP6:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv32i16(<vscale x 32 x i16> [[ZM]], i64 16)
+// CPP-CHECK-NEXT: [[TMP7:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv32i16(<vscale x 32 x i16> [[ZM]], i64 24)
+// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.umlal.vg2x4.nxv8i16(i32 [[ADD]], <vscale x 8 x i16> [[TMP0]], <vscale x 8 x i16> [[TMP1]], <vscale x 8 x i16> [[TMP2]], <vscale x 8 x i16> [[TMP3]], <vscale x 8 x i16> [[TMP4]], <vscale x 8 x i16> [[TMP5]], <vscale x 8 x i16> [[TMP6]], <vscale x 8 x i16> [[TMP7]])
+// CPP-CHECK-NEXT: ret void
+//
+void test_svmla4_u16(uint32_t slice_base, svuint16x4_t zn, svuint16x4_t zm) __arm_streaming __arm_shared_za
+{
+ SVE_ACLE_FUNC(svmla_za32,_u16,_vg2x4,,)(slice_base + 6, zn, zm);
+}
+
+// CHECK-LABEL: @test_svmla4_s16(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[ADD:%.*]] = add i32 [[SLICE_BASE:%.*]], 6
+// CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv32i16(<vscale x 32 x i16> [[ZN:%.*]], i64 0)
+// CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv32i16(<vscale x 32 x i16> [[ZN]], i64 8)
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv32i16(<vscale x 32 x i16> [[ZN]], i64 16)
+// CHECK-NEXT: [[TMP3:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv32i16(<vscale x 32 x i16> [[ZN]], i64 24)
+// CHECK-NEXT: [[TMP4:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv32i16(<vscale x 32 x i16> [[ZM:%.*]], i64 0)
+// CHECK-NEXT: [[TMP5:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv32i16(<vscale x 32 x i16> [[ZM]], i64 8)
+// CHECK-NEXT: [[TMP6:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv32i16(<vscale x 32 x i16> [[ZM]], i64 16)
+// CHECK-NEXT: [[TMP7:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv32i16(<vscale x 32 x i16> [[ZM]], i64 24)
+// CHECK-NEXT: tail call void @llvm.aarch64.sme.smlal.vg2x4.nxv8i16(i32 [[ADD]], <vscale x 8 x i16> [[TMP0]], <vscale x 8 x i16> [[TMP1]], <vscale x 8 x i16> [[TMP2]], <vscale x 8 x i16> [[TMP3]], <vscale x 8 x i16> [[TMP4]], <vscale x 8 x i16> [[TMP5]], <vscale x 8 x i16> [[TMP6]], <vscale x 8 x i16> [[TMP7]])
+// CHECK-NEXT: ret void
+//
+// CPP-CHECK-LABEL: @_Z15test_svmla4_s16j11svint16x4_tS_(
+// CPP-CHECK-NEXT: entry:
+// CPP-CHECK-NEXT: [[ADD:%.*]] = add i32 [[SLICE_BASE:%.*]], 6
+// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv32i16(<vscale x 32 x i16> [[ZN:%.*]], i64 0)
+// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv32i16(<vscale x 32 x i16> [[ZN]], i64 8)
+// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv32i16(<vscale x 32 x i16> [[ZN]], i64 16)
+// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv32i16(<vscale x 32 x i16> [[ZN]], i64 24)
+// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv32i16(<vscale x 32 x i16> [[ZM:%.*]], i64 0)
+// CPP-CHECK-NEXT: [[TMP5:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv32i16(<vscale x 32 x i16> [[ZM]], i64 8)
+// CPP-CHECK-NEXT: [[TMP6:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv32i16(<vscale x 32 x i16> [[ZM]], i64 16)
+// CPP-CHECK-NEXT: [[TMP7:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv32i16(<vscale x 32 x i16> [[ZM]], i64 24)
+// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.smlal.vg2x4.nxv8i16(i32 [[ADD]], <vscale x 8 x i16> [[TMP0]], <vscale x 8 x i16> [[TMP1]], <vscale x 8 x i16> [[TMP2]], <vscale x 8 x i16> [[TMP3]], <vscale x 8 x i16> [[TMP4]], <vscale x 8 x i16> [[TMP5]], <vscale x 8 x i16> [[TMP6]], <vscale x 8 x i16> [[TMP7]])
+// CPP-CHECK-NEXT: ret void
+//
+void test_svmla4_s16(uint32_t slice_base, svint16x4_t zn, svint16x4_t zm) __arm_streaming __arm_shared_za
+{
+ SVE_ACLE_FUNC(svmla_za32,_s16,_vg2x4,,)(slice_base + 6, zn, zm);
+}
+
+//
+// Multi, single
+// CHECK-LABEL: @test_svmla_single1_f16(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[ADD:%.*]] = add i32 [[SLICE_BASE:%.*]], 14
+// CHECK-NEXT: tail call void @llvm.aarch64.sme.fmlal.single.vg2x1.nxv8f16(i32 [[ADD]], <vscale x 8 x half> [[ZN:%.*]], <vscale x 8 x half> [[ZM:%.*]])
+// CHECK-NEXT: ret void
+//
+// CPP-CHECK-LABEL: @_Z22test_svmla_single1_f16ju13__SVFloat16_tS_(
+// CPP-CHECK-NEXT: entry:
+// CPP-CHECK-NEXT: [[ADD:%.*]] = add i32 [[SLICE_BASE:%.*]], 14
+// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.fmlal.single.vg2x1.nxv8f16(i32 [[ADD]], <vscale x 8 x half> [[ZN:%.*]], <vscale x 8 x half> [[ZM:%.*]])
+// CPP-CHECK-NEXT: ret void
+//
+void test_svmla_single1_f16(uint32_t slice_base, svfloat16_t zn, svfloat16_t zm) __arm_streaming __arm_shared_za
+{
+ SVE_ACLE_FUNC(svmla_za32,_f16,_vg2x1,,)(slice_base + 14, zn, zm);
+}
+
+// CHECK-LABEL: @test_svmla_single1_bf16(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[ADD:%.*]] = add i32 [[SLICE_BASE:%.*]], 14
+// CHECK-NEXT: tail call void @llvm.aarch64.sme.fmlal.single.vg2x1.nxv8bf16(i32 [[ADD]], <vscale x 8 x bfloat> [[ZN:%.*]], <vscale x 8 x bfloat> [[ZM:%.*]])
+// CHECK-NEXT: ret void
+//
+// CPP-CHECK-LABEL: @_Z23test_svmla_single1_bf16ju14__SVBfloat16_tS_(
+// CPP-CHECK-NEXT: entry:
+// CPP-CHECK-NEXT: [[ADD:%.*]] = add i32 [[SLICE_BASE:%.*]], 14
+// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.fmlal.single.vg2x1.nxv8bf16(i32 [[ADD]], <vscale x 8 x bfloat> [[ZN:%.*]], <vscale x 8 x bfloat> [[ZM:%.*]])
+// CPP-CHECK-NEXT: ret void
+//
+void test_svmla_single1_bf16(uint32_t slice_base, svbfloat16_t zn, svbfloat16_t zm) __arm_streaming __arm_shared_za
+{
+ SVE_ACLE_FUNC(svmla_za32,_bf16,_vg2x1,,)(slice_base + 14, zn, zm);
+}
+
+// CHECK-LABEL: @test_svmla_single1_u16(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[ADD:%.*]] = add i32 [[SLICE_BASE:%.*]], 14
+// CHECK-NEXT: tail call void @llvm.aarch64.sme.umlal.single.vg2x1.nxv8i16(i32 [[ADD]], <vscale x 8 x i16> [[ZN:%.*]], <vscale x 8 x i16> [[ZM:%.*]])
+// CHECK-NEXT: ret void
+//
+// CPP-CHECK-LABEL: @_Z22test_svmla_single1_u16ju12__SVUint16_tS_(
+// CPP-CHECK-NEXT: entry:
+// CPP-CHECK-NEXT: [[ADD:%.*]] = add i32 [[SLICE_BASE:%.*]], 14
+// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.umlal.single.vg2x1.nxv8i16(i32 [[ADD]], <vscale x 8 x i16> [[ZN:%.*]], <vscale x 8 x i16> [[ZM:%.*]])
+// CPP-CHECK-NEXT: ret void
+//
+void test_svmla_single1_u16(uint32_t slice_base, svuint16_t zn, svuint16_t zm) __arm_streaming __arm_shared_za
+{
+ SVE_ACLE_FUNC(svmla_za32,_u16,_vg2x1,,)(slice_base + 14, zn, zm);
+}
+
+// CHECK-LABEL: @test_svmla_single1_s16(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[ADD:%.*]] = add i32 [[SLICE_BASE:%.*]], 14
+// CHECK-NEXT: tail call void @llvm.aarch64.sme.smlal.single.vg2x1.nxv8i16(i32 [[ADD]], <vscale x 8 x i16> [[ZN:%.*]], <vscale x 8 x i16> [[ZM:%.*]])
+// CHECK-NEXT: ret void
+//
+// CPP-CHECK-LABEL: @_Z22test_svmla_single1_s16ju11__SVInt16_tS_(
+// CPP-CHECK-NEXT: entry:
+// CPP-CHECK-NEXT: [[ADD:%.*]] = add i32 [[SLICE_BASE:%.*]], 14
+// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.smlal.single.vg2x1.nxv8i16(i32 [[ADD]], <vscale x 8 x i16> [[ZN:%.*]], <vscale x 8 x i16> [[ZM:%.*]])
+// CPP-CHECK-NEXT: ret void
+//
+void test_svmla_single1_s16(uint32_t slice_base, svint16_t zn, svint16_t zm) __arm_streaming __arm_shared_za
+{
+ SVE_ACLE_FUNC(svmla_za32,_s16,_vg2x1,,)(slice_base + 14, zn, zm);
+}
+
+// CHECK-LABEL: @test_svmla_single2_f16(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[ADD:%.*]] = add i32 [[SLICE_BASE:%.*]], 6
+// CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 8 x half> @llvm.vector.extract.nxv8f16.nxv16f16(<vscale x 16 x half> [[ZN:%.*]], i64 0)
+// CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 8 x half> @llvm.vector.extract.nxv8f16.nxv16f16(<vscale x 16 x half> [[ZN]], i64 8)
+// CHECK-NEXT: tail call void @llvm.aarch64.sme.fmlal.single.vg2x2.nxv8f16(i32 [[ADD]], <vscale x 8 x half> [[TMP0]], <vscale x 8 x half> [[TMP1]], <vscale x 8 x half> [[ZM:%.*]])
+// CHECK-NEXT: ret void
+//
+// CPP-CHECK-LABEL: @_Z22test_svmla_single2_f16j13svfloat16x2_tu13__SVFloat16_t(
+// CPP-CHECK-NEXT: entry:
+// CPP-CHECK-NEXT: [[ADD:%.*]] = add i32 [[SLICE_BASE:%.*]], 6
+// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 8 x half> @llvm.vector.extract.nxv8f16.nxv16f16(<vscale x 16 x half> [[ZN:%.*]], i64 0)
+// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 8 x half> @llvm.vector.extract.nxv8f16.nxv16f16(<vscale x 16 x half> [[ZN]], i64 8)
+// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.fmlal.single.vg2x2.nxv8f16(i32 [[ADD]], <vscale x 8 x half> [[TMP0]], <vscale x 8 x half> [[TMP1]], <vscale x 8 x half> [[ZM:%.*]])
+// CPP-CHECK-NEXT: ret void
+//
+void test_svmla_single2_f16(uint32_t slice_base, svfloat16x2_t zn, svfloat16_t zm) __arm_streaming __arm_shared_za
+{
+ SVE_ACLE_FUNC(svmla,_single,_za32,_f16,_vg2x2)(slice_base + 6, zn, zm);
+}
+
+// CHECK-LABEL: @test_svmla_single2_bf16(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[ADD:%.*]] = add i32 [[SLICE_BASE:%.*]], 6
+// CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 8 x bfloat> @llvm.vector.extract.nxv8bf16.nxv16bf16(<vscale x 16 x bfloat> [[ZN:%.*]], i64 0)
+// CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 8 x bfloat> @llvm.vector.extract.nxv8bf16.nxv16bf16(<vscale x 16 x bfloat> [[ZN]], i64 8)
+// CHECK-NEXT: tail call void @llvm.aarch64.sme.fmlal.single.vg2x2.nxv8bf16(i32 [[ADD]], <vscale x 8 x bfloat> [[TMP0]], <vscale x 8 x bfloat> [[TMP1]], <vscale x 8 x bfloat> [[ZM:%.*]])
+// CHECK-NEXT: ret void
+//
+// CPP-CHECK-LABEL: @_Z23test_svmla_single2_bf16j14svbfloat16x2_tu14__SVBfloat16_t(
+// CPP-CHECK-NEXT: entry:
+// CPP-CHECK-NEXT: [[ADD:%.*]] = add i32 [[SLICE_BASE:%.*]], 6
+// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 8 x bfloat> @llvm.vector.extract.nxv8bf16.nxv16bf16(<vscale x 16 x bfloat> [[ZN:%.*]], i64 0)
+// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 8 x bfloat> @llvm.vector.extract.nxv8bf16.nxv16bf16(<vscale x 16 x bfloat> [[ZN]], i64 8)
+// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.fmlal.single.vg2x2.nxv8bf16(i32 [[ADD]], <vscale x 8 x bfloat> [[TMP0]], <vscale x 8 x bfloat> [[TMP1]], <vscale x 8 x bfloat> [[ZM:%.*]])
+// CPP-CHECK-NEXT: ret void
+//
+void test_svmla_single2_bf16(uint32_t slice_base, svbfloat16x2_t zn, svbfloat16_t zm) __arm_streaming __arm_shared_za
+{
+ SVE_ACLE_FUNC(svmla,_single,_za32,_bf16,_vg2x2)(slice_base + 6, zn, zm);
+}
+
+// CHECK-LABEL: @test_svmla_single2_u16(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[ADD:%.*]] = add i32 [[SLICE_BASE:%.*]], 6
+// CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv16i16(<vscale x 16 x i16> [[ZN:%.*]], i64 0)
+// CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv16i16(<vscale x 16 x i16> [[ZN]], i64 8)
+// CHECK-NEXT: tail call void @llvm.aarch64.sme.umlal.single.vg2x2.nxv8i16(i32 [[ADD]], <vscale x 8 x i16> [[TMP0]], <vscale x 8 x i16> [[TMP1]], <vscale x 8 x i16> [[ZM:%.*]])
+// CHECK-NEXT: ret void
+//
+// CPP-CHECK-LABEL: @_Z22test_svmla_single2_u16j12svuint16x2_tu12__SVUint16_t(
+// CPP-CHECK-NEXT: entry:
+// CPP-CHECK-NEXT: [[ADD:%.*]] = add i32 [[SLICE_BASE:%.*]], 6
+// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv16i16(<vscale x 16 x i16> [[ZN:%.*]], i64 0)
+// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv16i16(<vscale x 16 x i16> [[ZN]], i64 8)
+// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.umlal.single.vg2x2.nxv8i16(i32 [[ADD]], <vscale x 8 x i16> [[TMP0]], <vscale x 8 x i16> [[TMP1]], <vscale x 8 x i16> [[ZM:%.*]])
+// CPP-CHECK-NEXT: ret void
+//
+void test_svmla_single2_u16(uint32_t slice_base, svuint16x2_t zn, svuint16_t zm) __arm_streaming __arm_shared_za
+{
+ SVE_ACLE_FUNC(svmla,_single,_za32,_u16,_vg2x2)(slice_base + 6, zn, zm);
+}
+
+// CHECK-LABEL: @test_svmla_single2_s16(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[ADD:%.*]] = add i32 [[SLICE_BASE:%.*]], 6
+// CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv16i16(<vscale x 16 x i16> [[ZN:%.*]], i64 0)
+// CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv16i16(<vscale x 16 x i16> [[ZN]], i64 8)
+// CHECK-NEXT: tail call void @llvm.aarch64.sme.smlal.single.vg2x2.nxv8i16(i32 [[ADD]], <vscale x 8 x i16> [[TMP0]], <vscale x 8 x i16> [[TMP1]], <vscale x 8 x i16> [[ZM:%.*]])
+// CHECK-NEXT: ret void
+//
+// CPP-CHECK-LABEL: @_Z22test_svmla_single2_s16j11svint16x2_tu11__SVInt16_t(
+// CPP-CHECK-NEXT: entry:
+// CPP-CHECK-NEXT: [[ADD:%.*]] = add i32 [[SLICE_BASE:%.*]], 6
+// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv16i16(<vscale x 16 x i16> [[ZN:%.*]], i64 0)
+// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv16i16(<vscale x 16 x i16> [[ZN]], i64 8)
+// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.smlal.single.vg2x2.nxv8i16(i32 [[ADD]], <vscale x 8 x i16> [[TMP0]], <vscale x 8 x i16> [[TMP1]], <vscale x 8 x i16> [[ZM:%.*]])
+// CPP-CHECK-NEXT: ret void
+//
+void test_svmla_single2_s16(uint32_t slice_base, svint16x2_t zn, svint16_t zm) __arm_streaming __arm_shared_za
+{
+ SVE_ACLE_FUNC(svmla,_single,_za32,_s16,_vg2x2)(slice_base + 6, zn, zm);
+}
+
+// CHECK-LABEL: @test_svmla_single4_f16(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[ADD:%.*]] = add i32 [[SLICE_BASE:%.*]], 6
+// CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 8 x half> @llvm.vector.extract.nxv8f16.nxv32f16(<vscale x 32 x half> [[ZN:%.*]], i64 0)
+// CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 8 x half> @llvm.vector.extract.nxv8f16.nxv32f16(<vscale x 32 x half> [[ZN]], i64 8)
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <vscale x 8 x half> @llvm.vector.extract.nxv8f16.nxv32f16(<vscale x 32 x half> [[ZN]], i64 16)
+// CHECK-NEXT: [[TMP3:%.*]] = tail call <vscale x 8 x half> @llvm.vector.extract.nxv8f16.nxv32f16(<vscale x 32 x half> [[ZN]], i64 24)
+// CHECK-NEXT: tail call void @llvm.aarch64.sme.fmlal.single.vg2x4.nxv8f16(i32 [[ADD]], <vscale x 8 x half> [[TMP0]], <vscale x 8 x half> [[TMP1]], <vscale x 8 x half> [[TMP2]], <vscale x 8 x half> [[TMP3]], <vscale x 8 x half> [[ZM:%.*]])
+// CHECK-NEXT: ret void
+//
+// CPP-CHECK-LABEL: @_Z22test_svmla_single4_f16j13svfloat16x4_tu13__SVFloat16_t(
+// CPP-CHECK-NEXT: entry:
+// CPP-CHECK-NEXT: [[ADD:%.*]] = add i32 [[SLICE_BASE:%.*]], 6
+// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 8 x half> @llvm.vector.extract.nxv8f16.nxv32f16(<vscale x 32 x half> [[ZN:%.*]], i64 0)
+// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 8 x half> @llvm.vector.extract.nxv8f16.nxv32f16(<vscale x 32 x half> [[ZN]], i64 8)
+// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call <vscale x 8 x half> @llvm.vector.extract.nxv8f16.nxv32f16(<vscale x 32 x half> [[ZN]], i64 16)
+// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call <vscale x 8 x half> @llvm.vector.extract.nxv8f16.nxv32f16(<vscale x 32 x half> [[ZN]], i64 24)
+// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.fmlal.single.vg2x4.nxv8f16(i32 [[ADD]], <vscale x 8 x half> [[TMP0]], <vscale x 8 x half> [[TMP1]], <vscale x 8 x half> [[TMP2]], <vscale x 8 x half> [[TMP3]], <vscale x 8 x half> [[ZM:%.*]])
+// CPP-CHECK-NEXT: ret void
+//
+void test_svmla_single4_f16(uint32_t slice_base, svfloat16x4_t zn, svfloat16_t zm) __arm_streaming __arm_shared_za
+{
+ SVE_ACLE_FUNC(svmla,_single,_za32,_f16,_vg2x4)(slice_base + 6, zn, zm);
+}
+
+// CHECK-LABEL: @test_svmla_single4_bf16(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[ADD:%.*]] = add i32 [[SLICE_BASE:%.*]], 6
+// CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 8 x bfloat> @llvm.vector.extract.nxv8bf16.nxv32bf16(<vscale x 32 x bfloat> [[ZN:%.*]], i64 0)
+// CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 8 x bfloat> @llvm.vector.extract.nxv8bf16.nxv32bf16(<vscale x 32 x bfloat> [[ZN]], i64 8)
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <vscale x 8 x bfloat> @llvm.vector.extract.nxv8bf16.nxv32bf16(<vscale x 32 x bfloat> [[ZN]], i64 16)
+// CHECK-NEXT: [[TMP3:%.*]] = tail call <vscale x 8 x bfloat> @llvm.vector.extract.nxv8bf16.nxv32bf16(<vscale x 32 x bfloat> [[ZN]], i64 24)
+// CHECK-NEXT: tail call void @llvm.aarch64.sme.fmlal.single.vg2x4.nxv8bf16(i32 [[ADD]], <vscale x 8 x bfloat> [[TMP0]], <vscale x 8 x bfloat> [[TMP1]], <vscale x 8 x bfloat> [[TMP2]], <vscale x 8 x bfloat> [[TMP3]], <vscale x 8 x bfloat> [[ZM:%.*]])
+// CHECK-NEXT: ret void
+//
+// CPP-CHECK-LABEL: @_Z23test_svmla_single4_bf16j14svbfloat16x4_tu14__SVBfloat16_t(
+// CPP-CHECK-NEXT: entry:
+// CPP-CHECK-NEXT: [[ADD:%.*]] = add i32 [[SLICE_BASE:%.*]], 6
+// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 8 x bfloat> @llvm.vector.extract.nxv8bf16.nxv32bf16(<vscale x 32 x bfloat> [[ZN:%.*]], i64 0)
+// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 8 x bfloat> @llvm.vector.extract.nxv8bf16.nxv32bf16(<vscale x 32 x bfloat> [[ZN]], i64 8)
+// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call <vscale x 8 x bfloat> @llvm.vector.extract.nxv8bf16.nxv32bf16(<vscale x 32 x bfloat> [[ZN]], i64 16)
+// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call <vscale x 8 x bfloat> @llvm.vector.extract.nxv8bf16.nxv32bf16(<vscale x 32 x bfloat> [[ZN]], i64 24)
+// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.fmlal.single.vg2x4.nxv8bf16(i32 [[ADD]], <vscale x 8 x bfloat> [[TMP0]], <vscale x 8 x bfloat> [[TMP1]], <vscale x 8 x bfloat> [[TMP2]], <vscale x 8 x bfloat> [[TMP3]], <vscale x 8 x bfloat> [[ZM:%.*]])
+// CPP-CHECK-NEXT: ret void
+//
+void test_svmla_single4_bf16(uint32_t slice_base, svbfloat16x4_t zn, svbfloat16_t zm) __arm_streaming __arm_shared_za
+{
+ SVE_ACLE_FUNC(svmla,_single,_za32,_bf16,_vg2x4)(slice_base + 6, zn, zm);
+}
+
+// CHECK-LABEL: @test_svmla_single4_u16(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[ADD:%.*]] = add i32 [[SLICE_BASE:%.*]], 6
+// CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv32i16(<vscale x 32 x i16> [[ZN:%.*]], i64 0)
+// CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv32i16(<vscale x 32 x i16> [[ZN]], i64 8)
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv32i16(<vscale x 32 x i16> [[ZN]], i64 16)
+// CHECK-NEXT: [[TMP3:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv32i16(<vscale x 32 x i16> [[ZN]], i64 24)
+// CHECK-NEXT: tail call void @llvm.aarch64.sme.umlal.single.vg2x4.nxv8i16(i32 [[ADD]], <vscale x 8 x i16> [[TMP0]], <vscale x 8 x i16> [[TMP1]], <vscale x 8 x i16> [[TMP2]], <vscale x 8 x i16> [[TMP3]], <vscale x 8 x i16> [[ZM:%.*]])
+// CHECK-NEXT: ret void
+//
+// CPP-CHECK-LABEL: @_Z22test_svmla_single4_u16j12svuint16x4_tu12__SVUint16_t(
+// CPP-CHECK-NEXT: entry:
+// CPP-CHECK-NEXT: [[ADD:%.*]] = add i32 [[SLICE_BASE:%.*]], 6
+// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv32i16(<vscale x 32 x i16> [[ZN:%.*]], i64 0)
+// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv32i16(<vscale x 32 x i16> [[ZN]], i64 8)
+// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv32i16(<vscale x 32 x i16> [[ZN]], i64 16)
+// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv32i16(<vscale x 32 x i16> [[ZN]], i64 24)
+// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.umlal.single.vg2x4.nxv8i16(i32 [[ADD]], <vscale x 8 x i16> [[TMP0]], <vscale x 8 x i16> [[TMP1]], <vscale x 8 x i16> [[TMP2]], <vscale x 8 x i16> [[TMP3]], <vscale x 8 x i16> [[ZM:%.*]])
+// CPP-CHECK-NEXT: ret void
+//
+void test_svmla_single4_u16(uint32_t slice_base, svuint16x4_t zn, svuint16_t zm) __arm_streaming __arm_shared_za
+{
+ SVE_ACLE_FUNC(svmla,_single,_za32,_u16,_vg2x4)(slice_base + 6, zn, zm);
+}
+
+// CHECK-LABEL: @test_svmla_single4_s16(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[ADD:%.*]] = add i32 [[SLICE_BASE:%.*]], 6
+// CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv32i16(<vscale x 32 x i16> [[ZN:%.*]], i64 0)
+// CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv32i16(<vscale x 32 x i16> [[ZN]], i64 8)
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv32i16(<vscale x 32 x i16> [[ZN]], i64 16)
+// CHECK-NEXT: [[TMP3:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv32i16(<vscale x 32 x i16> [[ZN]], i64 24)
+// CHECK-NEXT: tail call void @llvm.aarch64.sme.smlal.single.vg2x4.nxv8i16(i32 [[ADD]], <vscale x 8 x i16> [[TMP0]], <vscale x 8 x i16> [[TMP1]], <vscale x 8 x i16> [[TMP2]], <vscale x 8 x i16> [[TMP3]], <vscale x 8 x i16> [[ZM:%.*]])
+// CHECK-NEXT: ret void
+//
+// CPP-CHECK-LABEL: @_Z22test_svmla_single4_s16j11svint16x4_tu11__SVInt16_t(
+// CPP-CHECK-NEXT: entry:
+// CPP-CHECK-NEXT: [[ADD:%.*]] = add i32 [[SLICE_BASE:%.*]], 6
+// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv32i16(<vscale x 32 x i16> [[ZN:%.*]], i64 0)
+// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv32i16(<vscale x 32 x i16> [[ZN]], i64 8)
+// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv32i16(<vscale x 32 x i16> [[ZN]], i64 16)
+// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv32i16(<vscale x 32 x i16> [[ZN]], i64 24)
+// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.smlal.single.vg2x4.nxv8i16(i32 [[ADD]], <vscale x 8 x i16> [[TMP0]], <vscale x 8 x i16> [[TMP1]], <vscale x 8 x i16> [[TMP2]], <vscale x 8 x i16> [[TMP3]], <vscale x 8 x i16> [[ZM:%.*]])
+// CPP-CHECK-NEXT: ret void
+//
+void test_svmla_single4_s16(uint32_t slice_base, svint16x4_t zn, svint16_t zm) __arm_streaming __arm_shared_za
+{
+ SVE_ACLE_FUNC(svmla,_single,_za32,_s16,_vg2x4)(slice_base + 6, zn, zm);
+}
+
+//
+// Multi, indexed
+//
+
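+// The trailing immediate in these lane forms selects the element of zm to
+// multiply by; the tests below exercise lane index 7, which should be the
+// upper bound of the immediate range for 16-bit element types.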
+// CHECK-LABEL: @test_svmla_lane1_f16(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[ADD:%.*]] = add i32 [[SLICE_BASE:%.*]], 14
+// CHECK-NEXT: tail call void @llvm.aarch64.sme.fmlal.lane.vg2x1.nxv8f16(i32 [[ADD]], <vscale x 8 x half> [[ZN:%.*]], <vscale x 8 x half> [[ZM:%.*]], i32 7)
+// CHECK-NEXT: ret void
+//
+// CPP-CHECK-LABEL: @_Z20test_svmla_lane1_f16ju13__SVFloat16_tS_(
+// CPP-CHECK-NEXT: entry:
+// CPP-CHECK-NEXT: [[ADD:%.*]] = add i32 [[SLICE_BASE:%.*]], 14
+// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.fmlal.lane.vg2x1.nxv8f16(i32 [[ADD]], <vscale x 8 x half> [[ZN:%.*]], <vscale x 8 x half> [[ZM:%.*]], i32 7)
+// CPP-CHECK-NEXT: ret void
+//
+void test_svmla_lane1_f16(uint32_t slice_base, svfloat16_t zn, svfloat16_t zm) __arm_streaming __arm_shared_za
+{
+ SVE_ACLE_FUNC(svmla_lane,,_za32,_f16,_vg2x1)(slice_base + 14, zn, zm, 7);
+}
+
+// CHECK-LABEL: @test_svmla_lane1_bf16(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[ADD:%.*]] = add i32 [[SLICE_BASE:%.*]], 14
+// CHECK-NEXT: tail call void @llvm.aarch64.sme.fmlal.lane.vg2x1.nxv8bf16(i32 [[ADD]], <vscale x 8 x bfloat> [[ZN:%.*]], <vscale x 8 x bfloat> [[ZM:%.*]], i32 7)
+// CHECK-NEXT: ret void
+//
+// CPP-CHECK-LABEL: @_Z21test_svmla_lane1_bf16ju14__SVBfloat16_tS_(
+// CPP-CHECK-NEXT: entry:
+// CPP-CHECK-NEXT: [[ADD:%.*]] = add i32 [[SLICE_BASE:%.*]], 14
+// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.fmlal.lane.vg2x1.nxv8bf16(i32 [[ADD]], <vscale x 8 x bfloat> [[ZN:%.*]], <vscale x 8 x bfloat> [[ZM:%.*]], i32 7)
+// CPP-CHECK-NEXT: ret void
+//
+void test_svmla_lane1_bf16(uint32_t slice_base, svbfloat16_t zn, svbfloat16_t zm) __arm_streaming __arm_shared_za
+{
+ SVE_ACLE_FUNC(svmla_lane,,_za32,_bf16,_vg2x1)(slice_base + 14, zn, zm, 7);
+}
+
+// CHECK-LABEL: @test_svmla_lane1_u16(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[ADD:%.*]] = add i32 [[SLICE_BASE:%.*]], 14
+// CHECK-NEXT: tail call void @llvm.aarch64.sme.umlal.lane.vg2x1.nxv8i16(i32 [[ADD]], <vscale x 8 x i16> [[ZN:%.*]], <vscale x 8 x i16> [[ZM:%.*]], i32 7)
+// CHECK-NEXT: ret void
+//
+// CPP-CHECK-LABEL: @_Z20test_svmla_lane1_u16ju12__SVUint16_tS_(
+// CPP-CHECK-NEXT: entry:
+// CPP-CHECK-NEXT: [[ADD:%.*]] = add i32 [[SLICE_BASE:%.*]], 14
+// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.umlal.lane.vg2x1.nxv8i16(i32 [[ADD]], <vscale x 8 x i16> [[ZN:%.*]], <vscale x 8 x i16> [[ZM:%.*]], i32 7)
+// CPP-CHECK-NEXT: ret void
+//
+void test_svmla_lane1_u16(uint32_t slice_base, svuint16_t zn, svuint16_t zm) __arm_streaming __arm_shared_za
+{
+ SVE_ACLE_FUNC(svmla_lane,,_za32,_u16,_vg2x1)(slice_base + 14, zn, zm, 7);
+}
+
+// CHECK-LABEL: @test_svmla_lane1_s16(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[ADD:%.*]] = add i32 [[SLICE_BASE:%.*]], 14
+// CHECK-NEXT: tail call void @llvm.aarch64.sme.smlal.lane.vg2x1.nxv8i16(i32 [[ADD]], <vscale x 8 x i16> [[ZN:%.*]], <vscale x 8 x i16> [[ZM:%.*]], i32 7)
+// CHECK-NEXT: ret void
+//
+// CPP-CHECK-LABEL: @_Z20test_svmla_lane1_s16ju11__SVInt16_tS_(
+// CPP-CHECK-NEXT: entry:
+// CPP-CHECK-NEXT: [[ADD:%.*]] = add i32 [[SLICE_BASE:%.*]], 14
+// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.smlal.lane.vg2x1.nxv8i16(i32 [[ADD]], <vscale x 8 x i16> [[ZN:%.*]], <vscale x 8 x i16> [[ZM:%.*]], i32 7)
+// CPP-CHECK-NEXT: ret void
+//
+void test_svmla_lane1_s16(uint32_t slice_base, svint16_t zn, svint16_t zm) __arm_streaming __arm_shared_za
+{
+ SVE_ACLE_FUNC(svmla_lane,,_za32,_s16,_vg2x1)(slice_base + 14, zn, zm, 7);
+}
+
+// CHECK-LABEL: @test_svmla_lane2_f16(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[ADD:%.*]] = add i32 [[SLICE_BASE:%.*]], 6
+// CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 8 x half> @llvm.vector.extract.nxv8f16.nxv16f16(<vscale x 16 x half> [[ZN:%.*]], i64 0)
+// CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 8 x half> @llvm.vector.extract.nxv8f16.nxv16f16(<vscale x 16 x half> [[ZN]], i64 8)
+// CHECK-NEXT: tail call void @llvm.aarch64.sme.fmlal.lane.vg2x2.nxv8f16(i32 [[ADD]], <vscale x 8 x half> [[TMP0]], <vscale x 8 x half> [[TMP1]], <vscale x 8 x half> [[ZM:%.*]], i32 7)
+// CHECK-NEXT: ret void
+//
+// CPP-CHECK-LABEL: @_Z20test_svmla_lane2_f16j13svfloat16x2_tu13__SVFloat16_t(
+// CPP-CHECK-NEXT: entry:
+// CPP-CHECK-NEXT: [[ADD:%.*]] = add i32 [[SLICE_BASE:%.*]], 6
+// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 8 x half> @llvm.vector.extract.nxv8f16.nxv16f16(<vscale x 16 x half> [[ZN:%.*]], i64 0)
+// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 8 x half> @llvm.vector.extract.nxv8f16.nxv16f16(<vscale x 16 x half> [[ZN]], i64 8)
+// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.fmlal.lane.vg2x2.nxv8f16(i32 [[ADD]], <vscale x 8 x half> [[TMP0]], <vscale x 8 x half> [[TMP1]], <vscale x 8 x half> [[ZM:%.*]], i32 7)
+// CPP-CHECK-NEXT: ret void
+//
+void test_svmla_lane2_f16(uint32_t slice_base, svfloat16x2_t zn, svfloat16_t zm) __arm_streaming __arm_shared_za
+{
+ SVE_ACLE_FUNC(svmla_lane,,_za32,_f16,_vg2x2)(slice_base + 6, zn, zm, 7);
+}
+
+// CHECK-LABEL: @test_svmla_lane2_bf16(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[ADD:%.*]] = add i32 [[SLICE_BASE:%.*]], 6
+// CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 8 x bfloat> @llvm.vector.extract.nxv8bf16.nxv16bf16(<vscale x 16 x bfloat> [[ZN:%.*]], i64 0)
+// CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 8 x bfloat> @llvm.vector.extract.nxv8bf16.nxv16bf16(<vscale x 16 x bfloat> [[ZN]], i64 8)
+// CHECK-NEXT: tail call void @llvm.aarch64.sme.fmlal.lane.vg2x2.nxv8bf16(i32 [[ADD]], <vscale x 8 x bfloat> [[TMP0]], <vscale x 8 x bfloat> [[TMP1]], <vscale x 8 x bfloat> [[ZM:%.*]], i32 7)
+// CHECK-NEXT: ret void
+//
+// CPP-CHECK-LABEL: @_Z21test_svmla_lane2_bf16j14svbfloat16x2_tu14__SVBfloat16_t(
+// CPP-CHECK-NEXT: entry:
+// CPP-CHECK-NEXT: [[ADD:%.*]] = add i32 [[SLICE_BASE:%.*]], 6
+// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 8 x bfloat> @llvm.vector.extract.nxv8bf16.nxv16bf16(<vscale x 16 x bfloat> [[ZN:%.*]], i64 0)
+// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 8 x bfloat> @llvm.vector.extract.nxv8bf16.nxv16bf16(<vscale x 16 x bfloat> [[ZN]], i64 8)
+// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.fmlal.lane.vg2x2.nxv8bf16(i32 [[ADD]], <vscale x 8 x bfloat> [[TMP0]], <vscale x 8 x bfloat> [[TMP1]], <vscale x 8 x bfloat> [[ZM:%.*]], i32 7)
+// CPP-CHECK-NEXT: ret void
+//
+void test_svmla_lane2_bf16(uint32_t slice_base, svbfloat16x2_t zn, svbfloat16_t zm) __arm_streaming __arm_shared_za
+{
+ SVE_ACLE_FUNC(svmla_lane,,_za32,_bf16,_vg2x2)(slice_base + 6, zn, zm, 7);
+}
+
+// CHECK-LABEL: @test_svmla_lane2_u16(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[ADD:%.*]] = add i32 [[SLICE_BASE:%.*]], 6
+// CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv16i16(<vscale x 16 x i16> [[ZN:%.*]], i64 0)
+// CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv16i16(<vscale x 16 x i16> [[ZN]], i64 8)
+// CHECK-NEXT: tail call void @llvm.aarch64.sme.umlal.lane.vg2x2.nxv8i16(i32 [[ADD]], <vscale x 8 x i16> [[TMP0]], <vscale x 8 x i16> [[TMP1]], <vscale x 8 x i16> [[ZM:%.*]], i32 7)
+// CHECK-NEXT: ret void
+//
+// CPP-CHECK-LABEL: @_Z20test_svmla_lane2_u16j12svuint16x2_tu12__SVUint16_t(
+// CPP-CHECK-NEXT: entry:
+// CPP-CHECK-NEXT: [[ADD:%.*]] = add i32 [[SLICE_BASE:%.*]], 6
+// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv16i16(<vscale x 16 x i16> [[ZN:%.*]], i64 0)
+// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv16i16(<vscale x 16 x i16> [[ZN]], i64 8)
+// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.umlal.lane.vg2x2.nxv8i16(i32 [[ADD]], <vscale x 8 x i16> [[TMP0]], <vscale x 8 x i16> [[TMP1]], <vscale x 8 x i16> [[ZM:%.*]], i32 7)
+// CPP-CHECK-NEXT: ret void
+//
+void test_svmla_lane2_u16(uint32_t slice_base, svuint16x2_t zn, svuint16_t zm) __arm_streaming __arm_shared_za
+{
+ SVE_ACLE_FUNC(svmla_lane,,_za32,_u16,_vg2x2)(slice_base + 6, zn, zm, 7);
+}
+
+// CHECK-LABEL: @test_svmla_lane2_s16(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[ADD:%.*]] = add i32 [[SLICE_BASE:%.*]], 6
+// CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv16i16(<vscale x 16 x i16> [[ZN:%.*]], i64 0)
+// CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv16i16(<vscale x 16 x i16> [[ZN]], i64 8)
+// CHECK-NEXT: tail call void @llvm.aarch64.sme.smlal.lane.vg2x2.nxv8i16(i32 [[ADD]], <vscale x 8 x i16> [[TMP0]], <vscale x 8 x i16> [[TMP1]], <vscale x 8 x i16> [[ZM:%.*]], i32 7)
+// CHECK-NEXT: ret void
+//
+// CPP-CHECK-LABEL: @_Z20test_svmla_lane2_s16j11svint16x2_tu11__SVInt16_t(
+// CPP-CHECK-NEXT: entry:
+// CPP-CHECK-NEXT: [[ADD:%.*]] = add i32 [[SLICE_BASE:%.*]], 6
+// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv16i16(<vscale x 16 x i16> [[ZN:%.*]], i64 0)
+// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv16i16(<vscale x 16 x i16> [[ZN]], i64 8)
+// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.smlal.lane.vg2x2.nxv8i16(i32 [[ADD]], <vscale x 8 x i16> [[TMP0]], <vscale x 8 x i16> [[TMP1]], <vscale x 8 x i16> [[ZM:%.*]], i32 7)
+// CPP-CHECK-NEXT: ret void
+//
+void test_svmla_lane2_s16(uint32_t slice_base, svint16x2_t zn, svint16_t zm) __arm_streaming __arm_shared_za
+{
+ SVE_ACLE_FUNC(svmla_lane,,_za32,_s16,_vg2x2)(slice_base + 6, zn, zm, 7);
+}
+
+// CHECK-LABEL: @test_svmla_lane4_f16(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[ADD:%.*]] = add i32 [[SLICE_BASE:%.*]], 6
+// CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 8 x half> @llvm.vector.extract.nxv8f16.nxv32f16(<vscale x 32 x half> [[ZN:%.*]], i64 0)
+// CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 8 x half> @llvm.vector.extract.nxv8f16.nxv32f16(<vscale x 32 x half> [[ZN]], i64 8)
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <vscale x 8 x half> @llvm.vector.extract.nxv8f16.nxv32f16(<vscale x 32 x half> [[ZN]], i64 16)
+// CHECK-NEXT: [[TMP3:%.*]] = tail call <vscale x 8 x half> @llvm.vector.extract.nxv8f16.nxv32f16(<vscale x 32 x half> [[ZN]], i64 24)
+// CHECK-NEXT: tail call void @llvm.aarch64.sme.fmlal.lane.vg2x4.nxv8f16(i32 [[ADD]], <vscale x 8 x half> [[TMP0]], <vscale x 8 x half> [[TMP1]], <vscale x 8 x half> [[TMP2]], <vscale x 8 x half> [[TMP3]], <vscale x 8 x half> [[ZM:%.*]], i32 7)
+// CHECK-NEXT: ret void
+//
+// CPP-CHECK-LABEL: @_Z20test_svmla_lane4_f16j13svfloat16x4_tu13__SVFloat16_t(
+// CPP-CHECK-NEXT: entry:
+// CPP-CHECK-NEXT: [[ADD:%.*]] = add i32 [[SLICE_BASE:%.*]], 6
+// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 8 x half> @llvm.vector.extract.nxv8f16.nxv32f16(<vscale x 32 x half> [[ZN:%.*]], i64 0)
+// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 8 x half> @llvm.vector.extract.nxv8f16.nxv32f16(<vscale x 32 x half> [[ZN]], i64 8)
+// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call <vscale x 8 x half> @llvm.vector.extract.nxv8f16.nxv32f16(<vscale x 32 x half> [[ZN]], i64 16)
+// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call <vscale x 8 x half> @llvm.vector.extract.nxv8f16.nxv32f16(<vscale x 32 x half> [[ZN]], i64 24)
+// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.fmlal.lane.vg2x4.nxv8f16(i32 [[ADD]], <vscale x 8 x half> [[TMP0]], <vscale x 8 x half> [[TMP1]], <vscale x 8 x half> [[TMP2]], <vscale x 8 x half> [[TMP3]], <vscale x 8 x half> [[ZM:%.*]], i32 7)
+// CPP-CHECK-NEXT: ret void
+//
+void test_svmla_lane4_f16(uint32_t slice_base, svfloat16x4_t zn, svfloat16_t zm) __arm_streaming __arm_shared_za
+{
+ SVE_ACLE_FUNC(svmla_lane,,_za32,_f16,_vg2x4)(slice_base + 6, zn, zm, 7);
+}
+
+// CHECK-LABEL: @test_svmla_lane4_bf16(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[ADD:%.*]] = add i32 [[SLICE_BASE:%.*]], 6
+// CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 8 x bfloat> @llvm.vector.extract.nxv8bf16.nxv32bf16(<vscale x 32 x bfloat> [[ZN:%.*]], i64 0)
+// CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 8 x bfloat> @llvm.vector.extract.nxv8bf16.nxv32bf16(<vscale x 32 x bfloat> [[ZN]], i64 8)
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <vscale x 8 x bfloat> @llvm.vector.extract.nxv8bf16.nxv32bf16(<vscale x 32 x bfloat> [[ZN]], i64 16)
+// CHECK-NEXT: [[TMP3:%.*]] = tail call <vscale x 8 x bfloat> @llvm.vector.extract.nxv8bf16.nxv32bf16(<vscale x 32 x bfloat> [[ZN]], i64 24)
+// CHECK-NEXT: tail call void @llvm.aarch64.sme.fmlal.lane.vg2x4.nxv8bf16(i32 [[ADD]], <vscale x 8 x bfloat> [[TMP0]], <vscale x 8 x bfloat> [[TMP1]], <vscale x 8 x bfloat> [[TMP2]], <vscale x 8 x bfloat> [[TMP3]], <vscale x 8 x bfloat> [[ZM:%.*]], i32 7)
+// CHECK-NEXT: ret void
+//
+// CPP-CHECK-LABEL: @_Z21test_svmla_lane4_bf16j14svbfloat16x4_tu14__SVBfloat16_t(
+// CPP-CHECK-NEXT: entry:
+// CPP-CHECK-NEXT: [[ADD:%.*]] = add i32 [[SLICE_BASE:%.*]], 6
+// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 8 x bfloat> @llvm.vector.extract.nxv8bf16.nxv32bf16(<vscale x 32 x bfloat> [[ZN:%.*]], i64 0)
+// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 8 x bfloat> @llvm.vector.extract.nxv8bf16.nxv32bf16(<vscale x 32 x bfloat> [[ZN]], i64 8)
+// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call <vscale x 8 x bfloat> @llvm.vector.extract.nxv8bf16.nxv32bf16(<vscale x 32 x bfloat> [[ZN]], i64 16)
+// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call <vscale x 8 x bfloat> @llvm.vector.extract.nxv8bf16.nxv32bf16(<vscale x 32 x bfloat> [[ZN]], i64 24)
+// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.fmlal.lane.vg2x4.nxv8bf16(i32 [[ADD]], <vscale x 8 x bfloat> [[TMP0]], <vscale x 8 x bfloat> [[TMP1]], <vscale x 8 x bfloat> [[TMP2]], <vscale x 8 x bfloat> [[TMP3]], <vscale x 8 x bfloat> [[ZM:%.*]], i32 7)
+// CPP-CHECK-NEXT: ret void
+//
+void test_svmla_lane4_bf16(uint32_t slice_base, svbfloat16x4_t zn, svbfloat16_t zm) __arm_streaming __arm_shared_za
+{
+ SVE_ACLE_FUNC(svmla_lane,,_za32,_bf16,_vg2x4)(slice_base + 6, zn, zm, 7);
+}
+
+// CHECK-LABEL: @test_svmla_lane4_u16(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[ADD:%.*]] = add i32 [[SLICE_BASE:%.*]], 6
+// CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv32i16(<vscale x 32 x i16> [[ZN:%.*]], i64 0)
+// CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv32i16(<vscale x 32 x i16> [[ZN]], i64 8)
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv32i16(<vscale x 32 x i16> [[ZN]], i64 16)
+// CHECK-NEXT: [[TMP3:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv32i16(<vscale x 32 x i16> [[ZN]], i64 24)
+// CHECK-NEXT: tail call void @llvm.aarch64.sme.umlal.lane.vg2x4.nxv8i16(i32 [[ADD]], <vscale x 8 x i16> [[TMP0]], <vscale x 8 x i16> [[TMP1]], <vscale x 8 x i16> [[TMP2]], <vscale x 8 x i16> [[TMP3]], <vscale x 8 x i16> [[ZM:%.*]], i32 7)
+// CHECK-NEXT: ret void
+//
+// CPP-CHECK-LABEL: @_Z20test_svmla_lane4_u16j12svuint16x4_tu12__SVUint16_t(
+// CPP-CHECK-NEXT: entry:
+// CPP-CHECK-NEXT: [[ADD:%.*]] = add i32 [[SLICE_BASE:%.*]], 6
+// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv32i16(<vscale x 32 x i16> [[ZN:%.*]], i64 0)
+// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv32i16(<vscale x 32 x i16> [[ZN]], i64 8)
+// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv32i16(<vscale x 32 x i16> [[ZN]], i64 16)
+// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv32i16(<vscale x 32 x i16> [[ZN]], i64 24)
+// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.umlal.lane.vg2x4.nxv8i16(i32 [[ADD]], <vscale x 8 x i16> [[TMP0]], <vscale x 8 x i16> [[TMP1]], <vscale x 8 x i16> [[TMP2]], <vscale x 8 x i16> [[TMP3]], <vscale x 8 x i16> [[ZM:%.*]], i32 7)
+// CPP-CHECK-NEXT: ret void
+//
+void test_svmla_lane4_u16(uint32_t slice_base, svuint16x4_t zn, svuint16_t zm) __arm_streaming __arm_shared_za
+{
+ SVE_ACLE_FUNC(svmla_lane,,_za32,_u16,_vg2x4)(slice_base + 6, zn, zm, 7);
+}
+
+// CHECK-LABEL: @test_svmla_lane4_s16(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[ADD:%.*]] = add i32 [[SLICE_BASE:%.*]], 6
+// CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv32i16(<vscale x 32 x i16> [[ZN:%.*]], i64 0)
+// CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv32i16(<vscale x 32 x i16> [[ZN]], i64 8)
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv32i16(<vscale x 32 x i16> [[ZN]], i64 16)
+// CHECK-NEXT: [[TMP3:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv32i16(<vscale x 32 x i16> [[ZN]], i64 24)
+// CHECK-NEXT: tail call void @llvm.aarch64.sme.smlal.lane.vg2x4.nxv8i16(i32 [[ADD]], <vscale x 8 x i16> [[TMP0]], <vscale x 8 x i16> [[TMP1]], <vscale x 8 x i16> [[TMP2]], <vscale x 8 x i16> [[TMP3]], <vscale x 8 x i16> [[ZM:%.*]], i32 7)
+// CHECK-NEXT: ret void
+//
+// CPP-CHECK-LABEL: @_Z20test_svmla_lane4_s16j11svint16x4_tu11__SVInt16_t(
+// CPP-CHECK-NEXT: entry:
+// CPP-CHECK-NEXT: [[ADD:%.*]] = add i32 [[SLICE_BASE:%.*]], 6
+// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv32i16(<vscale x 32 x i16> [[ZN:%.*]], i64 0)
+// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv32i16(<vscale x 32 x i16> [[ZN]], i64 8)
+// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv32i16(<vscale x 32 x i16> [[ZN]], i64 16)
+// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv32i16(<vscale x 32 x i16> [[ZN]], i64 24)
+// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.smlal.lane.vg2x4.nxv8i16(i32 [[ADD]], <vscale x 8 x i16> [[TMP0]], <vscale x 8 x i16> [[TMP1]], <vscale x 8 x i16> [[TMP2]], <vscale x 8 x i16> [[TMP3]], <vscale x 8 x i16> [[ZM:%.*]], i32 7)
+// CPP-CHECK-NEXT: ret void
+//
+void test_svmla_lane4_s16(uint32_t slice_base, svint16x4_t zn, svint16_t zm) __arm_streaming __arm_shared_za
+{
+ SVE_ACLE_FUNC(svmla_lane,,_za32,_s16,_vg2x4)(slice_base + 6, zn, zm, 7);
+}
diff --git a/clang/test/CodeGen/aarch64-sme2-intrinsics/acle_sme2_mlall.c b/clang/test/CodeGen/aarch64-sme2-intrinsics/acle_sme2_mlall.c
new file mode 100644
index 00000000000000..7d00ddafb84c5b
--- /dev/null
+++ b/clang/test/CodeGen/aarch64-sme2-intrinsics/acle_sme2_mlall.c
@@ -0,0 +1,1950 @@
+// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py
+// REQUIRES: aarch64-registered-target
+
+// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme2 -target-feature +sme-i16i64 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s
+// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme2 -target-feature +sme-i16i64 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK
+// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sme2 -target-feature +sme-i16i64 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s
+// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sme2 -target-feature +sme-i16i64 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK
+// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme2 -target-feature +sme-i16i64 -S -disable-O0-optnone -Werror -Wall -o /dev/null %s
+#include <arm_sme_draft_spec_subject_to_change.h>
+
+#ifdef SVE_OVERLOADED_FORMS
+// A simple used,unused... macro, long enough to represent any SVE builtin.
+#define SVE_ACLE_FUNC(A1,A2_UNUSED,A3,A4_UNUSED,A5) A1##A3##A5
+#else
+#define SVE_ACLE_FUNC(A1,A2,A3,A4,A5) A1##A2##A3##A4##A5
+#endif
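+// For example, SVE_ACLE_FUNC(svmla,_single,_za32,_s8,_vg4x2) expands to the
+// overloaded name svmla_za32_vg4x2 when SVE_OVERLOADED_FORMS is defined, and
+// to the full name svmla_single_za32_s8_vg4x2 otherwise.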
+
+//
+// Single x 1
+//
+
+// MLAL
+
+// CHECK-LABEL: @test_svmla_single_x1_s8(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[ADD:%.*]] = add i32 [[SLICE_BASE:%.*]], 12
+// CHECK-NEXT: tail call void @llvm.aarch64.sme.smla.za32.single.vg4x1.nxv16i8(i32 [[ADD]], <vscale x 16 x i8> [[ZN:%.*]], <vscale x 16 x i8> [[ZM:%.*]])
+// CHECK-NEXT: ret void
+//
+// CPP-CHECK-LABEL: @_Z23test_svmla_single_x1_s8ju10__SVInt8_tS_(
+// CPP-CHECK-NEXT: entry:
+// CPP-CHECK-NEXT: [[ADD:%.*]] = add i32 [[SLICE_BASE:%.*]], 12
+// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.smla.za32.single.vg4x1.nxv16i8(i32 [[ADD]], <vscale x 16 x i8> [[ZN:%.*]], <vscale x 16 x i8> [[ZM:%.*]])
+// CPP-CHECK-NEXT: ret void
+//
+void test_svmla_single_x1_s8(uint32_t slice_base, svint8_t zn, svint8_t zm) __arm_streaming __arm_shared_za
+{
+ SVE_ACLE_FUNC(svmla_za32,_s8,_vg4x1,,)(slice_base + 12, zn, zm);
+}
+
+// CHECK-LABEL: @test_svmla_single_x1_s16(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[ADD:%.*]] = add i32 [[SLICE_BASE:%.*]], 12
+// CHECK-NEXT: tail call void @llvm.aarch64.sme.smla.za64.single.vg4x1.nxv8i16(i32 [[ADD]], <vscale x 8 x i16> [[ZN:%.*]], <vscale x 8 x i16> [[ZM:%.*]])
+// CHECK-NEXT: ret void
+//
+// CPP-CHECK-LABEL: @_Z24test_svmla_single_x1_s16ju11__SVInt16_tS_(
+// CPP-CHECK-NEXT: entry:
+// CPP-CHECK-NEXT: [[ADD:%.*]] = add i32 [[SLICE_BASE:%.*]], 12
+// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.smla.za64.single.vg4x1.nxv8i16(i32 [[ADD]], <vscale x 8 x i16> [[ZN:%.*]], <vscale x 8 x i16> [[ZM:%.*]])
+// CPP-CHECK-NEXT: ret void
+//
+void test_svmla_single_x1_s16(uint32_t slice_base, svint16_t zn, svint16_t zm) __arm_streaming __arm_shared_za
+{
+ SVE_ACLE_FUNC(svmla_za64,_s16,_vg4x1,,)(slice_base + 12, zn, zm);
+}
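+
+// The 16-bit integer MLAL forms widen into 64-bit ZA accumulators (za64),
+// which is why the RUN lines above pass -target-feature +sme-i16i64 in
+// addition to +sme2.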
+
+// CHECK-LABEL: @test_uvmlal_single_x1_u8(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[ADD:%.*]] = add i32 [[SLICE_BASE:%.*]], 12
+// CHECK-NEXT: tail call void @llvm.aarch64.sme.umla.za32.single.vg4x1.nxv16i8(i32 [[ADD]], <vscale x 16 x i8> [[ZN:%.*]], <vscale x 16 x i8> [[ZM:%.*]])
+// CHECK-NEXT: ret void
+//
+// CPP-CHECK-LABEL: @_Z24test_uvmlal_single_x1_u8ju11__SVUint8_tS_(
+// CPP-CHECK-NEXT: entry:
+// CPP-CHECK-NEXT: [[ADD:%.*]] = add i32 [[SLICE_BASE:%.*]], 12
+// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.umla.za32.single.vg4x1.nxv16i8(i32 [[ADD]], <vscale x 16 x i8> [[ZN:%.*]], <vscale x 16 x i8> [[ZM:%.*]])
+// CPP-CHECK-NEXT: ret void
+//
+void test_uvmlal_single_x1_u8(uint32_t slice_base, svuint8_t zn, svuint8_t zm) __arm_streaming __arm_shared_za
+{
+ SVE_ACLE_FUNC(svmla_za32,_u8,_vg4x1,,)(slice_base + 12, zn, zm);
+}
+
+// CHECK-LABEL: @test_uvmlal_single_x1_u16(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[ADD:%.*]] = add i32 [[SLICE_BASE:%.*]], 12
+// CHECK-NEXT: tail call void @llvm.aarch64.sme.umla.za64.single.vg4x1.nxv8i16(i32 [[ADD]], <vscale x 8 x i16> [[ZN:%.*]], <vscale x 8 x i16> [[ZM:%.*]])
+// CHECK-NEXT: ret void
+//
+// CPP-CHECK-LABEL: @_Z25test_uvmlal_single_x1_u16ju12__SVUint16_tS_(
+// CPP-CHECK-NEXT: entry:
+// CPP-CHECK-NEXT: [[ADD:%.*]] = add i32 [[SLICE_BASE:%.*]], 12
+// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.umla.za64.single.vg4x1.nxv8i16(i32 [[ADD]], <vscale x 8 x i16> [[ZN:%.*]], <vscale x 8 x i16> [[ZM:%.*]])
+// CPP-CHECK-NEXT: ret void
+//
+void test_uvmlal_single_x1_u16(uint32_t slice_base, svuint16_t zn, svuint16_t zm) __arm_streaming __arm_shared_za
+{
+ SVE_ACLE_FUNC(svmla_za64,_u16,_vg4x1,,)(slice_base + 12, zn, zm);
+}
+
+// MLSL
+
+// CHECK-LABEL: @test_svmls_single_x1_s8(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[ADD:%.*]] = add i32 [[SLICE_BASE:%.*]], 12
+// CHECK-NEXT: tail call void @llvm.aarch64.sme.smls.za32.single.vg4x1.nxv16i8(i32 [[ADD]], <vscale x 16 x i8> [[ZN:%.*]], <vscale x 16 x i8> [[ZM:%.*]])
+// CHECK-NEXT: ret void
+//
+// CPP-CHECK-LABEL: @_Z23test_svmls_single_x1_s8ju10__SVInt8_tS_(
+// CPP-CHECK-NEXT: entry:
+// CPP-CHECK-NEXT: [[ADD:%.*]] = add i32 [[SLICE_BASE:%.*]], 12
+// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.smls.za32.single.vg4x1.nxv16i8(i32 [[ADD]], <vscale x 16 x i8> [[ZN:%.*]], <vscale x 16 x i8> [[ZM:%.*]])
+// CPP-CHECK-NEXT: ret void
+//
+void test_svmls_single_x1_s8(uint32_t slice_base, svint8_t zn, svint8_t zm) __arm_streaming __arm_shared_za
+{
+ SVE_ACLE_FUNC(svmls_za32,_s8,_vg4x1,,)(slice_base + 12, zn, zm);
+}
+
+// CHECK-LABEL: @test_svmls_single_x1_s16(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[ADD:%.*]] = add i32 [[SLICE_BASE:%.*]], 12
+// CHECK-NEXT: tail call void @llvm.aarch64.sme.smls.za64.single.vg4x1.nxv8i16(i32 [[ADD]], <vscale x 8 x i16> [[ZN:%.*]], <vscale x 8 x i16> [[ZM:%.*]])
+// CHECK-NEXT: ret void
+//
+// CPP-CHECK-LABEL: @_Z24test_svmls_single_x1_s16ju11__SVInt16_tS_(
+// CPP-CHECK-NEXT: entry:
+// CPP-CHECK-NEXT: [[ADD:%.*]] = add i32 [[SLICE_BASE:%.*]], 12
+// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.smls.za64.single.vg4x1.nxv8i16(i32 [[ADD]], <vscale x 8 x i16> [[ZN:%.*]], <vscale x 8 x i16> [[ZM:%.*]])
+// CPP-CHECK-NEXT: ret void
+//
+void test_svmls_single_x1_s16(uint32_t slice_base, svint16_t zn, svint16_t zm) __arm_streaming __arm_shared_za
+{
+ SVE_ACLE_FUNC(svmls_za64,_s16,_vg4x1,,)(slice_base + 12, zn, zm);
+}
+
+// CHECK-LABEL: @test_uvmlsl_single_x1_u8(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[ADD:%.*]] = add i32 [[SLICE_BASE:%.*]], 12
+// CHECK-NEXT: tail call void @llvm.aarch64.sme.umls.za32.single.vg4x1.nxv16i8(i32 [[ADD]], <vscale x 16 x i8> [[ZN:%.*]], <vscale x 16 x i8> [[ZM:%.*]])
+// CHECK-NEXT: ret void
+//
+// CPP-CHECK-LABEL: @_Z24test_uvmlsl_single_x1_u8ju11__SVUint8_tS_(
+// CPP-CHECK-NEXT: entry:
+// CPP-CHECK-NEXT: [[ADD:%.*]] = add i32 [[SLICE_BASE:%.*]], 12
+// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.umls.za32.single.vg4x1.nxv16i8(i32 [[ADD]], <vscale x 16 x i8> [[ZN:%.*]], <vscale x 16 x i8> [[ZM:%.*]])
+// CPP-CHECK-NEXT: ret void
+//
+void test_uvmlsl_single_x1_u8(uint32_t slice_base, svuint8_t zn, svuint8_t zm) __arm_streaming __arm_shared_za
+{
+ SVE_ACLE_FUNC(svmls_za32,_u8,_vg4x1,,)(slice_base + 12, zn, zm);
+}
+
+// CHECK-LABEL: @test_uvmlsl_single_x1_u16(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[ADD:%.*]] = add i32 [[SLICE_BASE:%.*]], 12
+// CHECK-NEXT: tail call void @llvm.aarch64.sme.umls.za64.single.vg4x1.nxv8i16(i32 [[ADD]], <vscale x 8 x i16> [[ZN:%.*]], <vscale x 8 x i16> [[ZM:%.*]])
+// CHECK-NEXT: ret void
+//
+// CPP-CHECK-LABEL: @_Z25test_uvmlsl_single_x1_u16ju12__SVUint16_tS_(
+// CPP-CHECK-NEXT: entry:
+// CPP-CHECK-NEXT: [[ADD:%.*]] = add i32 [[SLICE_BASE:%.*]], 12
+// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.umls.za64.single.vg4x1.nxv8i16(i32 [[ADD]], <vscale x 8 x i16> [[ZN:%.*]], <vscale x 8 x i16> [[ZM:%.*]])
+// CPP-CHECK-NEXT: ret void
+//
+void test_uvmlsl_single_x1_u16(uint32_t slice_base, svuint16_t zn, svuint16_t zm) __arm_streaming __arm_shared_za
+{
+ SVE_ACLE_FUNC(svmls_za64,_u16,_vg4x1,,)(slice_base + 12, zn, zm);
+}
+
+// SUMLALL
+
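+// Note: there is no separate sumla intrinsic for the x1 form; svsumla_za32
+// is expected to map onto the usmla intrinsic with the zn and zm operands
+// commuted, which the CHECK lines below verify.
+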
+// CHECK-LABEL: @test_sumlall_single_x1_s8(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[ADD:%.*]] = add i32 [[SLICE_BASE:%.*]], 12
+// CHECK-NEXT: tail call void @llvm.aarch64.sme.usmla.za32.single.vg4x1.nxv16i8(i32 [[ADD]], <vscale x 16 x i8> [[ZM:%.*]], <vscale x 16 x i8> [[ZN:%.*]])
+// CHECK-NEXT: ret void
+//
+// CPP-CHECK-LABEL: @_Z25test_sumlall_single_x1_s8ju10__SVInt8_tu11__SVUint8_t(
+// CPP-CHECK-NEXT: entry:
+// CPP-CHECK-NEXT: [[ADD:%.*]] = add i32 [[SLICE_BASE:%.*]], 12
+// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.usmla.za32.single.vg4x1.nxv16i8(i32 [[ADD]], <vscale x 16 x i8> [[ZM:%.*]], <vscale x 16 x i8> [[ZN:%.*]])
+// CPP-CHECK-NEXT: ret void
+//
+void test_sumlall_single_x1_s8(uint32_t slice_base, svint8_t zn, svuint8_t zm) __arm_streaming __arm_shared_za
+{
+ SVE_ACLE_FUNC(svsumla_za32,_s8,_vg4x1,,)(slice_base + 12, zn, zm);
+}
+
+// USMLALL
+
+// CHECK-LABEL: @test_usmlall_single_x1_u8(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[ADD:%.*]] = add i32 [[SLICE_BASE:%.*]], 12
+// CHECK-NEXT: tail call void @llvm.aarch64.sme.usmla.za32.single.vg4x1.nxv16i8(i32 [[ADD]], <vscale x 16 x i8> [[ZN:%.*]], <vscale x 16 x i8> [[ZM:%.*]])
+// CHECK-NEXT: ret void
+//
+// CPP-CHECK-LABEL: @_Z25test_usmlall_single_x1_u8ju11__SVUint8_tu10__SVInt8_t(
+// CPP-CHECK-NEXT: entry:
+// CPP-CHECK-NEXT: [[ADD:%.*]] = add i32 [[SLICE_BASE:%.*]], 12
+// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.usmla.za32.single.vg4x1.nxv16i8(i32 [[ADD]], <vscale x 16 x i8> [[ZN:%.*]], <vscale x 16 x i8> [[ZM:%.*]])
+// CPP-CHECK-NEXT: ret void
+//
+void test_usmlall_single_x1_u8(uint32_t slice_base, svuint8_t zn, svint8_t zm) __arm_streaming __arm_shared_za
+{
+ SVE_ACLE_FUNC(svusmla_za32,_u8,_vg4x1,,)(slice_base + 12, zn, zm);
+}
+
+//
+// Single x 2
+//
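+// For the x2 and x4 forms, the tuple argument zn is lowered to two or four
+// llvm.vector.extract calls that split the wide aggregate into its parts
+// before the SME intrinsic is called, as the CHECK lines verify.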
+
+// MLAL
+
+// CHECK-LABEL: @test_svmla_single_x2_s8(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[ADD:%.*]] = add i32 [[SLICE_BASE:%.*]], 4
+// CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv32i8(<vscale x 32 x i8> [[ZN:%.*]], i64 0)
+// CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv32i8(<vscale x 32 x i8> [[ZN]], i64 16)
+// CHECK-NEXT: tail call void @llvm.aarch64.sme.smla.za32.single.vg4x2.nxv16i8(i32 [[ADD]], <vscale x 16 x i8> [[TMP0]], <vscale x 16 x i8> [[TMP1]], <vscale x 16 x i8> [[ZM:%.*]])
+// CHECK-NEXT: ret void
+//
+// CPP-CHECK-LABEL: @_Z23test_svmla_single_x2_s8j10svint8x2_tu10__SVInt8_t(
+// CPP-CHECK-NEXT: entry:
+// CPP-CHECK-NEXT: [[ADD:%.*]] = add i32 [[SLICE_BASE:%.*]], 4
+// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv32i8(<vscale x 32 x i8> [[ZN:%.*]], i64 0)
+// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv32i8(<vscale x 32 x i8> [[ZN]], i64 16)
+// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.smla.za32.single.vg4x2.nxv16i8(i32 [[ADD]], <vscale x 16 x i8> [[TMP0]], <vscale x 16 x i8> [[TMP1]], <vscale x 16 x i8> [[ZM:%.*]])
+// CPP-CHECK-NEXT: ret void
+//
+void test_svmla_single_x2_s8(uint32_t slice_base, svint8x2_t zn, svint8_t zm) __arm_streaming __arm_shared_za
+{
+ SVE_ACLE_FUNC(svmla,_single,_za32,_s8,_vg4x2)(slice_base + 4, zn, zm);
+}
+
+// CHECK-LABEL: @test_svmla_single_x2_s16(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[ADD:%.*]] = add i32 [[SLICE_BASE:%.*]], 4
+// CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv16i16(<vscale x 16 x i16> [[ZN:%.*]], i64 0)
+// CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv16i16(<vscale x 16 x i16> [[ZN]], i64 8)
+// CHECK-NEXT: tail call void @llvm.aarch64.sme.smla.za64.single.vg4x2.nxv8i16(i32 [[ADD]], <vscale x 8 x i16> [[TMP0]], <vscale x 8 x i16> [[TMP1]], <vscale x 8 x i16> [[ZM:%.*]])
+// CHECK-NEXT: ret void
+//
+// CPP-CHECK-LABEL: @_Z24test_svmla_single_x2_s16j11svint16x2_tu11__SVInt16_t(
+// CPP-CHECK-NEXT: entry:
+// CPP-CHECK-NEXT: [[ADD:%.*]] = add i32 [[SLICE_BASE:%.*]], 4
+// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv16i16(<vscale x 16 x i16> [[ZN:%.*]], i64 0)
+// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv16i16(<vscale x 16 x i16> [[ZN]], i64 8)
+// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.smla.za64.single.vg4x2.nxv8i16(i32 [[ADD]], <vscale x 8 x i16> [[TMP0]], <vscale x 8 x i16> [[TMP1]], <vscale x 8 x i16> [[ZM:%.*]])
+// CPP-CHECK-NEXT: ret void
+//
+void test_svmla_single_x2_s16(uint32_t slice_base, svint16x2_t zn, svint16_t zm) __arm_streaming __arm_shared_za
+{
+ SVE_ACLE_FUNC(svmla,_single,_za64,_s16,_vg4x2)(slice_base + 4, zn, zm);
+}
+
+// CHECK-LABEL: @test_svmla_single_x2_u8(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[ADD:%.*]] = add i32 [[SLICE_BASE:%.*]], 4
+// CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv32i8(<vscale x 32 x i8> [[ZN:%.*]], i64 0)
+// CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv32i8(<vscale x 32 x i8> [[ZN]], i64 16)
+// CHECK-NEXT: tail call void @llvm.aarch64.sme.umla.za32.single.vg4x2.nxv16i8(i32 [[ADD]], <vscale x 16 x i8> [[TMP0]], <vscale x 16 x i8> [[TMP1]], <vscale x 16 x i8> [[ZM:%.*]])
+// CHECK-NEXT: ret void
+//
+// CPP-CHECK-LABEL: @_Z23test_svmla_single_x2_u8j11svuint8x2_tu11__SVUint8_t(
+// CPP-CHECK-NEXT: entry:
+// CPP-CHECK-NEXT: [[ADD:%.*]] = add i32 [[SLICE_BASE:%.*]], 4
+// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv32i8(<vscale x 32 x i8> [[ZN:%.*]], i64 0)
+// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv32i8(<vscale x 32 x i8> [[ZN]], i64 16)
+// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.umla.za32.single.vg4x2.nxv16i8(i32 [[ADD]], <vscale x 16 x i8> [[TMP0]], <vscale x 16 x i8> [[TMP1]], <vscale x 16 x i8> [[ZM:%.*]])
+// CPP-CHECK-NEXT: ret void
+//
+void test_svmla_single_x2_u8(uint32_t slice_base, svuint8x2_t zn, svuint8_t zm) __arm_streaming __arm_shared_za
+{
+ SVE_ACLE_FUNC(svmla,_single,_za32,_u8,_vg4x2)(slice_base + 4, zn, zm);
+}
+
+// CHECK-LABEL: @test_svmla_single_x2_u16(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[ADD:%.*]] = add i32 [[SLICE_BASE:%.*]], 4
+// CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv16i16(<vscale x 16 x i16> [[ZN:%.*]], i64 0)
+// CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv16i16(<vscale x 16 x i16> [[ZN]], i64 8)
+// CHECK-NEXT: tail call void @llvm.aarch64.sme.umla.za64.single.vg4x2.nxv8i16(i32 [[ADD]], <vscale x 8 x i16> [[TMP0]], <vscale x 8 x i16> [[TMP1]], <vscale x 8 x i16> [[ZM:%.*]])
+// CHECK-NEXT: ret void
+//
+// CPP-CHECK-LABEL: @_Z24test_svmla_single_x2_u16j12svuint16x2_tu12__SVUint16_t(
+// CPP-CHECK-NEXT: entry:
+// CPP-CHECK-NEXT: [[ADD:%.*]] = add i32 [[SLICE_BASE:%.*]], 4
+// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv16i16(<vscale x 16 x i16> [[ZN:%.*]], i64 0)
+// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv16i16(<vscale x 16 x i16> [[ZN]], i64 8)
+// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.umla.za64.single.vg4x2.nxv8i16(i32 [[ADD]], <vscale x 8 x i16> [[TMP0]], <vscale x 8 x i16> [[TMP1]], <vscale x 8 x i16> [[ZM:%.*]])
+// CPP-CHECK-NEXT: ret void
+//
+void test_svmla_single_x2_u16(uint32_t slice_base, svuint16x2_t zn, svuint16_t zm) __arm_streaming __arm_shared_za
+{
+ SVE_ACLE_FUNC(svmla,_single,_za64,_u16,_vg4x2)(slice_base + 4, zn, zm);
+}
+
+// MLSL
+
+// CHECK-LABEL: @test_svmls_single_x2_s8(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[ADD:%.*]] = add i32 [[SLICE_BASE:%.*]], 4
+// CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv32i8(<vscale x 32 x i8> [[ZN:%.*]], i64 0)
+// CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv32i8(<vscale x 32 x i8> [[ZN]], i64 16)
+// CHECK-NEXT: tail call void @llvm.aarch64.sme.smls.za32.single.vg4x2.nxv16i8(i32 [[ADD]], <vscale x 16 x i8> [[TMP0]], <vscale x 16 x i8> [[TMP1]], <vscale x 16 x i8> [[ZM:%.*]])
+// CHECK-NEXT: ret void
+//
+// CPP-CHECK-LABEL: @_Z23test_svmls_single_x2_s8j10svint8x2_tu10__SVInt8_t(
+// CPP-CHECK-NEXT: entry:
+// CPP-CHECK-NEXT: [[ADD:%.*]] = add i32 [[SLICE_BASE:%.*]], 4
+// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv32i8(<vscale x 32 x i8> [[ZN:%.*]], i64 0)
+// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv32i8(<vscale x 32 x i8> [[ZN]], i64 16)
+// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.smls.za32.single.vg4x2.nxv16i8(i32 [[ADD]], <vscale x 16 x i8> [[TMP0]], <vscale x 16 x i8> [[TMP1]], <vscale x 16 x i8> [[ZM:%.*]])
+// CPP-CHECK-NEXT: ret void
+//
+void test_svmls_single_x2_s8(uint32_t slice_base, svint8x2_t zn, svint8_t zm) __arm_streaming __arm_shared_za
+{
+ SVE_ACLE_FUNC(svmls,_single,_za32,_s8,_vg4x2)(slice_base + 4, zn, zm);
+}
+
+// CHECK-LABEL: @test_svmls_single_x2_s16(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[ADD:%.*]] = add i32 [[SLICE_BASE:%.*]], 4
+// CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv16i16(<vscale x 16 x i16> [[ZN:%.*]], i64 0)
+// CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv16i16(<vscale x 16 x i16> [[ZN]], i64 8)
+// CHECK-NEXT: tail call void @llvm.aarch64.sme.smls.za64.single.vg4x2.nxv8i16(i32 [[ADD]], <vscale x 8 x i16> [[TMP0]], <vscale x 8 x i16> [[TMP1]], <vscale x 8 x i16> [[ZM:%.*]])
+// CHECK-NEXT: ret void
+//
+// CPP-CHECK-LABEL: @_Z24test_svmls_single_x2_s16j11svint16x2_tu11__SVInt16_t(
+// CPP-CHECK-NEXT: entry:
+// CPP-CHECK-NEXT: [[ADD:%.*]] = add i32 [[SLICE_BASE:%.*]], 4
+// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv16i16(<vscale x 16 x i16> [[ZN:%.*]], i64 0)
+// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv16i16(<vscale x 16 x i16> [[ZN]], i64 8)
+// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.smls.za64.single.vg4x2.nxv8i16(i32 [[ADD]], <vscale x 8 x i16> [[TMP0]], <vscale x 8 x i16> [[TMP1]], <vscale x 8 x i16> [[ZM:%.*]])
+// CPP-CHECK-NEXT: ret void
+//
+void test_svmls_single_x2_s16(uint32_t slice_base, svint16x2_t zn, svint16_t zm) __arm_streaming __arm_shared_za
+{
+ SVE_ACLE_FUNC(svmls,_single,_za64,_s16,_vg4x2)(slice_base + 4, zn, zm);
+}
+
+// CHECK-LABEL: @test_svmls_single_x2_u8(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[ADD:%.*]] = add i32 [[SLICE_BASE:%.*]], 4
+// CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv32i8(<vscale x 32 x i8> [[ZN:%.*]], i64 0)
+// CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv32i8(<vscale x 32 x i8> [[ZN]], i64 16)
+// CHECK-NEXT: tail call void @llvm.aarch64.sme.umls.za32.single.vg4x2.nxv16i8(i32 [[ADD]], <vscale x 16 x i8> [[TMP0]], <vscale x 16 x i8> [[TMP1]], <vscale x 16 x i8> [[ZM:%.*]])
+// CHECK-NEXT: ret void
+//
+// CPP-CHECK-LABEL: @_Z23test_svmls_single_x2_u8j11svuint8x2_tu11__SVUint8_t(
+// CPP-CHECK-NEXT: entry:
+// CPP-CHECK-NEXT: [[ADD:%.*]] = add i32 [[SLICE_BASE:%.*]], 4
+// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv32i8(<vscale x 32 x i8> [[ZN:%.*]], i64 0)
+// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv32i8(<vscale x 32 x i8> [[ZN]], i64 16)
+// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.umls.za32.single.vg4x2.nxv16i8(i32 [[ADD]], <vscale x 16 x i8> [[TMP0]], <vscale x 16 x i8> [[TMP1]], <vscale x 16 x i8> [[ZM:%.*]])
+// CPP-CHECK-NEXT: ret void
+//
+void test_svmls_single_x2_u8(uint32_t slice_base, svuint8x2_t zn, svuint8_t zm) __arm_streaming __arm_shared_za
+{
+ SVE_ACLE_FUNC(svmls,_single,_za32,_u8,_vg4x2)(slice_base + 4, zn, zm);
+}
+
+// CHECK-LABEL: @test_svmls_single_x2_u16(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[ADD:%.*]] = add i32 [[SLICE_BASE:%.*]], 4
+// CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv16i16(<vscale x 16 x i16> [[ZN:%.*]], i64 0)
+// CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv16i16(<vscale x 16 x i16> [[ZN]], i64 8)
+// CHECK-NEXT: tail call void @llvm.aarch64.sme.umls.za64.single.vg4x2.nxv8i16(i32 [[ADD]], <vscale x 8 x i16> [[TMP0]], <vscale x 8 x i16> [[TMP1]], <vscale x 8 x i16> [[ZM:%.*]])
+// CHECK-NEXT: ret void
+//
+// CPP-CHECK-LABEL: @_Z24test_svmls_single_x2_u16j12svuint16x2_tu12__SVUint16_t(
+// CPP-CHECK-NEXT: entry:
+// CPP-CHECK-NEXT: [[ADD:%.*]] = add i32 [[SLICE_BASE:%.*]], 4
+// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv16i16(<vscale x 16 x i16> [[ZN:%.*]], i64 0)
+// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv16i16(<vscale x 16 x i16> [[ZN]], i64 8)
+// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.umls.za64.single.vg4x2.nxv8i16(i32 [[ADD]], <vscale x 8 x i16> [[TMP0]], <vscale x 8 x i16> [[TMP1]], <vscale x 8 x i16> [[ZM:%.*]])
+// CPP-CHECK-NEXT: ret void
+//
+void test_svmls_single_x2_u16(uint32_t slice_base, svuint16x2_t zn, svuint16_t zm) __arm_streaming __arm_shared_za
+{
+ SVE_ACLE_FUNC(svmls,_single,_za64,_u16,_vg4x2)(slice_base + 4, zn, zm);
+}
+
+// SUMLALL
+
+// CHECK-LABEL: @test_svsumla_single_x2_s8(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[ADD:%.*]] = add i32 [[SLICE_BASE:%.*]], 4
+// CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv32i8(<vscale x 32 x i8> [[ZN:%.*]], i64 0)
+// CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv32i8(<vscale x 32 x i8> [[ZN]], i64 16)
+// CHECK-NEXT: tail call void @llvm.aarch64.sme.sumla.za32.single.vg4x2.nxv16i8(i32 [[ADD]], <vscale x 16 x i8> [[TMP0]], <vscale x 16 x i8> [[TMP1]], <vscale x 16 x i8> [[ZM:%.*]])
+// CHECK-NEXT: ret void
+//
+// CPP-CHECK-LABEL: @_Z25test_svsumla_single_x2_s8j10svint8x2_tu11__SVUint8_t(
+// CPP-CHECK-NEXT: entry:
+// CPP-CHECK-NEXT: [[ADD:%.*]] = add i32 [[SLICE_BASE:%.*]], 4
+// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv32i8(<vscale x 32 x i8> [[ZN:%.*]], i64 0)
+// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv32i8(<vscale x 32 x i8> [[ZN]], i64 16)
+// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.sumla.za32.single.vg4x2.nxv16i8(i32 [[ADD]], <vscale x 16 x i8> [[TMP0]], <vscale x 16 x i8> [[TMP1]], <vscale x 16 x i8> [[ZM:%.*]])
+// CPP-CHECK-NEXT: ret void
+//
+void test_svsumla_single_x2_s8(uint32_t slice_base, svint8x2_t zn, svuint8_t zm) __arm_streaming __arm_shared_za
+{
+ SVE_ACLE_FUNC(svsumla,_single,_za32,_s8,_vg4x2)(slice_base + 4, zn, zm);
+}
+
+// USMLALL
+
+// CHECK-LABEL: @test_usmlall_single_x2_u8(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[ADD:%.*]] = add i32 [[SLICE_BASE:%.*]], 4
+// CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv32i8(<vscale x 32 x i8> [[ZN:%.*]], i64 0)
+// CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv32i8(<vscale x 32 x i8> [[ZN]], i64 16)
+// CHECK-NEXT: tail call void @llvm.aarch64.sme.usmla.za32.single.vg4x2.nxv16i8(i32 [[ADD]], <vscale x 16 x i8> [[TMP0]], <vscale x 16 x i8> [[TMP1]], <vscale x 16 x i8> [[ZM:%.*]])
+// CHECK-NEXT: ret void
+//
+// CPP-CHECK-LABEL: @_Z25test_usmlall_single_x2_u8j11svuint8x2_tu10__SVInt8_t(
+// CPP-CHECK-NEXT: entry:
+// CPP-CHECK-NEXT: [[ADD:%.*]] = add i32 [[SLICE_BASE:%.*]], 4
+// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv32i8(<vscale x 32 x i8> [[ZN:%.*]], i64 0)
+// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv32i8(<vscale x 32 x i8> [[ZN]], i64 16)
+// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.usmla.za32.single.vg4x2.nxv16i8(i32 [[ADD]], <vscale x 16 x i8> [[TMP0]], <vscale x 16 x i8> [[TMP1]], <vscale x 16 x i8> [[ZM:%.*]])
+// CPP-CHECK-NEXT: ret void
+//
+void test_usmlall_single_x2_u8(uint32_t slice_base, svuint8x2_t zn, svint8_t zm) __arm_streaming __arm_shared_za
+{
+ SVE_ACLE_FUNC(svusmla,_single,_za32,_u8,_vg4x2)(slice_base + 4, zn, zm);
+}
+
+//
+// Single x 4
+//
+
+// MLAL
+
+// CHECK-LABEL: @test_svmla_single_x4_s8(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[ADD:%.*]] = add i32 [[SLICE_BASE:%.*]], 4
+// CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv64i8(<vscale x 64 x i8> [[ZN:%.*]], i64 0)
+// CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv64i8(<vscale x 64 x i8> [[ZN]], i64 16)
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv64i8(<vscale x 64 x i8> [[ZN]], i64 32)
+// CHECK-NEXT: [[TMP3:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv64i8(<vscale x 64 x i8> [[ZN]], i64 48)
+// CHECK-NEXT: tail call void @llvm.aarch64.sme.smla.za32.single.vg4x4.nxv16i8(i32 [[ADD]], <vscale x 16 x i8> [[TMP0]], <vscale x 16 x i8> [[TMP1]], <vscale x 16 x i8> [[TMP2]], <vscale x 16 x i8> [[TMP3]], <vscale x 16 x i8> [[ZM:%.*]])
+// CHECK-NEXT: ret void
+//
+// CPP-CHECK-LABEL: @_Z23test_svmla_single_x4_s8j10svint8x4_tu10__SVInt8_t(
+// CPP-CHECK-NEXT: entry:
+// CPP-CHECK-NEXT: [[ADD:%.*]] = add i32 [[SLICE_BASE:%.*]], 4
+// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv64i8(<vscale x 64 x i8> [[ZN:%.*]], i64 0)
+// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv64i8(<vscale x 64 x i8> [[ZN]], i64 16)
+// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv64i8(<vscale x 64 x i8> [[ZN]], i64 32)
+// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv64i8(<vscale x 64 x i8> [[ZN]], i64 48)
+// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.smla.za32.single.vg4x4.nxv16i8(i32 [[ADD]], <vscale x 16 x i8> [[TMP0]], <vscale x 16 x i8> [[TMP1]], <vscale x 16 x i8> [[TMP2]], <vscale x 16 x i8> [[TMP3]], <vscale x 16 x i8> [[ZM:%.*]])
+// CPP-CHECK-NEXT: ret void
+//
+void test_svmla_single_x4_s8(uint32_t slice_base, svint8x4_t zn, svint8_t zm) __arm_streaming __arm_shared_za
+{
+ SVE_ACLE_FUNC(svmla,_single,_za32,_s8,_vg4x4)(slice_base + 4, zn, zm);
+}
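+
+// Note: SVE_ACLE_FUNC is assumed to be the usual token-pasting harness macro
+// from this file's preamble (not shown here); under that assumption the call
+// above expands to svmla_single_za32_s8_vg4x4, or to the overloaded
+// svmla_za32_vg4x4 when SVE_OVERLOADED_FORMS is defined.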
+
+// CHECK-LABEL: @test_svmla_single_x4_s16(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[ADD:%.*]] = add i32 [[SLICE_BASE:%.*]], 4
+// CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv32i16(<vscale x 32 x i16> [[ZN:%.*]], i64 0)
+// CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv32i16(<vscale x 32 x i16> [[ZN]], i64 8)
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv32i16(<vscale x 32 x i16> [[ZN]], i64 16)
+// CHECK-NEXT: [[TMP3:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv32i16(<vscale x 32 x i16> [[ZN]], i64 24)
+// CHECK-NEXT: tail call void @llvm.aarch64.sme.smla.za64.single.vg4x4.nxv8i16(i32 [[ADD]], <vscale x 8 x i16> [[TMP0]], <vscale x 8 x i16> [[TMP1]], <vscale x 8 x i16> [[TMP2]], <vscale x 8 x i16> [[TMP3]], <vscale x 8 x i16> [[ZM:%.*]])
+// CHECK-NEXT: ret void
+//
+// CPP-CHECK-LABEL: @_Z24test_svmla_single_x4_s16j11svint16x4_tu11__SVInt16_t(
+// CPP-CHECK-NEXT: entry:
+// CPP-CHECK-NEXT: [[ADD:%.*]] = add i32 [[SLICE_BASE:%.*]], 4
+// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv32i16(<vscale x 32 x i16> [[ZN:%.*]], i64 0)
+// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv32i16(<vscale x 32 x i16> [[ZN]], i64 8)
+// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv32i16(<vscale x 32 x i16> [[ZN]], i64 16)
+// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv32i16(<vscale x 32 x i16> [[ZN]], i64 24)
+// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.smla.za64.single.vg4x4.nxv8i16(i32 [[ADD]], <vscale x 8 x i16> [[TMP0]], <vscale x 8 x i16> [[TMP1]], <vscale x 8 x i16> [[TMP2]], <vscale x 8 x i16> [[TMP3]], <vscale x 8 x i16> [[ZM:%.*]])
+// CPP-CHECK-NEXT: ret void
+//
+void test_svmla_single_x4_s16(uint32_t slice_base, svint16x4_t zn, svint16_t zm) __arm_streaming __arm_shared_za
+{
+ SVE_ACLE_FUNC(svmla,_single,_za64,_s16,_vg4x4)(slice_base + 4, zn, zm);
+}
+
+// CHECK-LABEL: @test_svmla_single_x4_u8(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[ADD:%.*]] = add i32 [[SLICE_BASE:%.*]], 4
+// CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv64i8(<vscale x 64 x i8> [[ZN:%.*]], i64 0)
+// CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv64i8(<vscale x 64 x i8> [[ZN]], i64 16)
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv64i8(<vscale x 64 x i8> [[ZN]], i64 32)
+// CHECK-NEXT: [[TMP3:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv64i8(<vscale x 64 x i8> [[ZN]], i64 48)
+// CHECK-NEXT: tail call void @llvm.aarch64.sme.umla.za32.single.vg4x4.nxv16i8(i32 [[ADD]], <vscale x 16 x i8> [[TMP0]], <vscale x 16 x i8> [[TMP1]], <vscale x 16 x i8> [[TMP2]], <vscale x 16 x i8> [[TMP3]], <vscale x 16 x i8> [[ZM:%.*]])
+// CHECK-NEXT: ret void
+//
+// CPP-CHECK-LABEL: @_Z23test_svmla_single_x4_u8j11svuint8x4_tu11__SVUint8_t(
+// CPP-CHECK-NEXT: entry:
+// CPP-CHECK-NEXT: [[ADD:%.*]] = add i32 [[SLICE_BASE:%.*]], 4
+// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv64i8(<vscale x 64 x i8> [[ZN:%.*]], i64 0)
+// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv64i8(<vscale x 64 x i8> [[ZN]], i64 16)
+// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv64i8(<vscale x 64 x i8> [[ZN]], i64 32)
+// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv64i8(<vscale x 64 x i8> [[ZN]], i64 48)
+// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.umla.za32.single.vg4x4.nxv16i8(i32 [[ADD]], <vscale x 16 x i8> [[TMP0]], <vscale x 16 x i8> [[TMP1]], <vscale x 16 x i8> [[TMP2]], <vscale x 16 x i8> [[TMP3]], <vscale x 16 x i8> [[ZM:%.*]])
+// CPP-CHECK-NEXT: ret void
+//
+void test_svmla_single_x4_u8(uint32_t slice_base, svuint8x4_t zn, svuint8_t zm) __arm_streaming __arm_shared_za
+{
+ SVE_ACLE_FUNC(svmla,_single,_za32,_u8,_vg4x4)(slice_base + 4, zn, zm);
+}
+
+// CHECK-LABEL: @test_svmla_single_x4_u16(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[ADD:%.*]] = add i32 [[SLICE_BASE:%.*]], 4
+// CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv32i16(<vscale x 32 x i16> [[ZN:%.*]], i64 0)
+// CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv32i16(<vscale x 32 x i16> [[ZN]], i64 8)
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv32i16(<vscale x 32 x i16> [[ZN]], i64 16)
+// CHECK-NEXT: [[TMP3:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv32i16(<vscale x 32 x i16> [[ZN]], i64 24)
+// CHECK-NEXT: tail call void @llvm.aarch64.sme.umla.za64.single.vg4x4.nxv8i16(i32 [[ADD]], <vscale x 8 x i16> [[TMP0]], <vscale x 8 x i16> [[TMP1]], <vscale x 8 x i16> [[TMP2]], <vscale x 8 x i16> [[TMP3]], <vscale x 8 x i16> [[ZM:%.*]])
+// CHECK-NEXT: ret void
+//
+// CPP-CHECK-LABEL: @_Z24test_svmla_single_x4_u16j12svuint16x4_tu12__SVUint16_t(
+// CPP-CHECK-NEXT: entry:
+// CPP-CHECK-NEXT: [[ADD:%.*]] = add i32 [[SLICE_BASE:%.*]], 4
+// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv32i16(<vscale x 32 x i16> [[ZN:%.*]], i64 0)
+// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv32i16(<vscale x 32 x i16> [[ZN]], i64 8)
+// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv32i16(<vscale x 32 x i16> [[ZN]], i64 16)
+// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv32i16(<vscale x 32 x i16> [[ZN]], i64 24)
+// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.umla.za64.single.vg4x4.nxv8i16(i32 [[ADD]], <vscale x 8 x i16> [[TMP0]], <vscale x 8 x i16> [[TMP1]], <vscale x 8 x i16> [[TMP2]], <vscale x 8 x i16> [[TMP3]], <vscale x 8 x i16> [[ZM:%.*]])
+// CPP-CHECK-NEXT: ret void
+//
+void test_svmla_single_x4_u16(uint32_t slice_base, svuint16x4_t zn, svuint16_t zm) __arm_streaming __arm_shared_za
+{
+ SVE_ACLE_FUNC(svmla,_single,_za64,_u16,_vg4x4)(slice_base + 4, zn, zm);
+}
+
+// MLSLL
+
+// CHECK-LABEL: @test_svmls_single_x4_s8(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[ADD:%.*]] = add i32 [[SLICE_BASE:%.*]], 4
+// CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv64i8(<vscale x 64 x i8> [[ZN:%.*]], i64 0)
+// CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv64i8(<vscale x 64 x i8> [[ZN]], i64 16)
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv64i8(<vscale x 64 x i8> [[ZN]], i64 32)
+// CHECK-NEXT: [[TMP3:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv64i8(<vscale x 64 x i8> [[ZN]], i64 48)
+// CHECK-NEXT: tail call void @llvm.aarch64.sme.smls.za32.single.vg4x4.nxv16i8(i32 [[ADD]], <vscale x 16 x i8> [[TMP0]], <vscale x 16 x i8> [[TMP1]], <vscale x 16 x i8> [[TMP2]], <vscale x 16 x i8> [[TMP3]], <vscale x 16 x i8> [[ZM:%.*]])
+// CHECK-NEXT: ret void
+//
+// CPP-CHECK-LABEL: @_Z23test_svmls_single_x4_s8j10svint8x4_tu10__SVInt8_t(
+// CPP-CHECK-NEXT: entry:
+// CPP-CHECK-NEXT: [[ADD:%.*]] = add i32 [[SLICE_BASE:%.*]], 4
+// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv64i8(<vscale x 64 x i8> [[ZN:%.*]], i64 0)
+// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv64i8(<vscale x 64 x i8> [[ZN]], i64 16)
+// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv64i8(<vscale x 64 x i8> [[ZN]], i64 32)
+// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv64i8(<vscale x 64 x i8> [[ZN]], i64 48)
+// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.smls.za32.single.vg4x4.nxv16i8(i32 [[ADD]], <vscale x 16 x i8> [[TMP0]], <vscale x 16 x i8> [[TMP1]], <vscale x 16 x i8> [[TMP2]], <vscale x 16 x i8> [[TMP3]], <vscale x 16 x i8> [[ZM:%.*]])
+// CPP-CHECK-NEXT: ret void
+//
+void test_svmls_single_x4_s8(uint32_t slice_base, svint8x4_t zn, svint8_t zm) __arm_streaming __arm_shared_za
+{
+ SVE_ACLE_FUNC(svmls,_single,_za32,_s8,_vg4x4)(slice_base + 4, zn, zm);
+}
+
+// CHECK-LABEL: @test_svmls_single_x4_s16(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[ADD:%.*]] = add i32 [[SLICE_BASE:%.*]], 4
+// CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv32i16(<vscale x 32 x i16> [[ZN:%.*]], i64 0)
+// CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv32i16(<vscale x 32 x i16> [[ZN]], i64 8)
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv32i16(<vscale x 32 x i16> [[ZN]], i64 16)
+// CHECK-NEXT: [[TMP3:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv32i16(<vscale x 32 x i16> [[ZN]], i64 24)
+// CHECK-NEXT: tail call void @llvm.aarch64.sme.smls.za64.single.vg4x4.nxv8i16(i32 [[ADD]], <vscale x 8 x i16> [[TMP0]], <vscale x 8 x i16> [[TMP1]], <vscale x 8 x i16> [[TMP2]], <vscale x 8 x i16> [[TMP3]], <vscale x 8 x i16> [[ZM:%.*]])
+// CHECK-NEXT: ret void
+//
+// CPP-CHECK-LABEL: @_Z24test_svmls_single_x4_s16j11svint16x4_tu11__SVInt16_t(
+// CPP-CHECK-NEXT: entry:
+// CPP-CHECK-NEXT: [[ADD:%.*]] = add i32 [[SLICE_BASE:%.*]], 4
+// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv32i16(<vscale x 32 x i16> [[ZN:%.*]], i64 0)
+// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv32i16(<vscale x 32 x i16> [[ZN]], i64 8)
+// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv32i16(<vscale x 32 x i16> [[ZN]], i64 16)
+// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv32i16(<vscale x 32 x i16> [[ZN]], i64 24)
+// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.smls.za64.single.vg4x4.nxv8i16(i32 [[ADD]], <vscale x 8 x i16> [[TMP0]], <vscale x 8 x i16> [[TMP1]], <vscale x 8 x i16> [[TMP2]], <vscale x 8 x i16> [[TMP3]], <vscale x 8 x i16> [[ZM:%.*]])
+// CPP-CHECK-NEXT: ret void
+//
+void test_svmls_single_x4_s16(uint32_t slice_base, svint16x4_t zn, svint16_t zm) __arm_streaming __arm_shared_za
+{
+ SVE_ACLE_FUNC(svmls,_single,_za64,_s16,_vg4x4)(slice_base + 4, zn, zm);
+}
+
+// CHECK-LABEL: @test_svmls_single_x4_u8(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[ADD:%.*]] = add i32 [[SLICE_BASE:%.*]], 4
+// CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv64i8(<vscale x 64 x i8> [[ZN:%.*]], i64 0)
+// CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv64i8(<vscale x 64 x i8> [[ZN]], i64 16)
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv64i8(<vscale x 64 x i8> [[ZN]], i64 32)
+// CHECK-NEXT: [[TMP3:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv64i8(<vscale x 64 x i8> [[ZN]], i64 48)
+// CHECK-NEXT: tail call void @llvm.aarch64.sme.umls.za32.single.vg4x4.nxv16i8(i32 [[ADD]], <vscale x 16 x i8> [[TMP0]], <vscale x 16 x i8> [[TMP1]], <vscale x 16 x i8> [[TMP2]], <vscale x 16 x i8> [[TMP3]], <vscale x 16 x i8> [[ZM:%.*]])
+// CHECK-NEXT: ret void
+//
+// CPP-CHECK-LABEL: @_Z23test_svmls_single_x4_u8j11svuint8x4_tu11__SVUint8_t(
+// CPP-CHECK-NEXT: entry:
+// CPP-CHECK-NEXT: [[ADD:%.*]] = add i32 [[SLICE_BASE:%.*]], 4
+// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv64i8(<vscale x 64 x i8> [[ZN:%.*]], i64 0)
+// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv64i8(<vscale x 64 x i8> [[ZN]], i64 16)
+// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv64i8(<vscale x 64 x i8> [[ZN]], i64 32)
+// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv64i8(<vscale x 64 x i8> [[ZN]], i64 48)
+// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.umls.za32.single.vg4x4.nxv16i8(i32 [[ADD]], <vscale x 16 x i8> [[TMP0]], <vscale x 16 x i8> [[TMP1]], <vscale x 16 x i8> [[TMP2]], <vscale x 16 x i8> [[TMP3]], <vscale x 16 x i8> [[ZM:%.*]])
+// CPP-CHECK-NEXT: ret void
+//
+void test_svmls_single_x4_u8(uint32_t slice_base, svuint8x4_t zn, svuint8_t zm) __arm_streaming __arm_shared_za
+{
+ SVE_ACLE_FUNC(svmls,_single,_za32,_u8,_vg4x4)(slice_base + 4, zn, zm);
+}
+
+// CHECK-LABEL: @test_svmls_single_x4_u16(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[ADD:%.*]] = add i32 [[SLICE_BASE:%.*]], 4
+// CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv32i16(<vscale x 32 x i16> [[ZN:%.*]], i64 0)
+// CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv32i16(<vscale x 32 x i16> [[ZN]], i64 8)
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv32i16(<vscale x 32 x i16> [[ZN]], i64 16)
+// CHECK-NEXT: [[TMP3:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv32i16(<vscale x 32 x i16> [[ZN]], i64 24)
+// CHECK-NEXT: tail call void @llvm.aarch64.sme.umls.za64.single.vg4x4.nxv8i16(i32 [[ADD]], <vscale x 8 x i16> [[TMP0]], <vscale x 8 x i16> [[TMP1]], <vscale x 8 x i16> [[TMP2]], <vscale x 8 x i16> [[TMP3]], <vscale x 8 x i16> [[ZM:%.*]])
+// CHECK-NEXT: ret void
+//
+// CPP-CHECK-LABEL: @_Z24test_svmls_single_x4_u16j12svuint16x4_tu12__SVUint16_t(
+// CPP-CHECK-NEXT: entry:
+// CPP-CHECK-NEXT: [[ADD:%.*]] = add i32 [[SLICE_BASE:%.*]], 4
+// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv32i16(<vscale x 32 x i16> [[ZN:%.*]], i64 0)
+// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv32i16(<vscale x 32 x i16> [[ZN]], i64 8)
+// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv32i16(<vscale x 32 x i16> [[ZN]], i64 16)
+// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv32i16(<vscale x 32 x i16> [[ZN]], i64 24)
+// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.umls.za64.single.vg4x4.nxv8i16(i32 [[ADD]], <vscale x 8 x i16> [[TMP0]], <vscale x 8 x i16> [[TMP1]], <vscale x 8 x i16> [[TMP2]], <vscale x 8 x i16> [[TMP3]], <vscale x 8 x i16> [[ZM:%.*]])
+// CPP-CHECK-NEXT: ret void
+//
+void test_svmls_single_x4_u16(uint32_t slice_base, svuint16x4_t zn, svuint16_t zm) __arm_streaming __arm_shared_za
+{
+ SVE_ACLE_FUNC(svmls,_single,_za64,_u16,_vg4x4)(slice_base + 4, zn, zm);
+}
+
+// SUMLALL
+
+// CHECK-LABEL: @test_svsumla_single_x4_s8(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[ADD:%.*]] = add i32 [[SLICE_BASE:%.*]], 4
+// CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv64i8(<vscale x 64 x i8> [[ZN:%.*]], i64 0)
+// CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv64i8(<vscale x 64 x i8> [[ZN]], i64 16)
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv64i8(<vscale x 64 x i8> [[ZN]], i64 32)
+// CHECK-NEXT: [[TMP3:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv64i8(<vscale x 64 x i8> [[ZN]], i64 48)
+// CHECK-NEXT: tail call void @llvm.aarch64.sme.sumla.za32.single.vg4x4.nxv16i8(i32 [[ADD]], <vscale x 16 x i8> [[TMP0]], <vscale x 16 x i8> [[TMP1]], <vscale x 16 x i8> [[TMP2]], <vscale x 16 x i8> [[TMP3]], <vscale x 16 x i8> [[ZM:%.*]])
+// CHECK-NEXT: ret void
+//
+// CPP-CHECK-LABEL: @_Z25test_svsumla_single_x4_s8j10svint8x4_tu11__SVUint8_t(
+// CPP-CHECK-NEXT: entry:
+// CPP-CHECK-NEXT: [[ADD:%.*]] = add i32 [[SLICE_BASE:%.*]], 4
+// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv64i8(<vscale x 64 x i8> [[ZN:%.*]], i64 0)
+// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv64i8(<vscale x 64 x i8> [[ZN]], i64 16)
+// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv64i8(<vscale x 64 x i8> [[ZN]], i64 32)
+// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv64i8(<vscale x 64 x i8> [[ZN]], i64 48)
+// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.sumla.za32.single.vg4x4.nxv16i8(i32 [[ADD]], <vscale x 16 x i8> [[TMP0]], <vscale x 16 x i8> [[TMP1]], <vscale x 16 x i8> [[TMP2]], <vscale x 16 x i8> [[TMP3]], <vscale x 16 x i8> [[ZM:%.*]])
+// CPP-CHECK-NEXT: ret void
+//
+void test_svsumla_single_x4_s8(uint32_t slice_base, svint8x4_t zn, svuint8_t zm) __arm_streaming __arm_shared_za
+{
+ SVE_ACLE_FUNC(svsumla,_single,_za32,_s8,_vg4x4)(slice_base + 4, zn, zm);
+}
+
+// USMLALL
+
+// CHECK-LABEL: @test_usmlall_single_x4_u8(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[ADD:%.*]] = add i32 [[SLICE_BASE:%.*]], 4
+// CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv64i8(<vscale x 64 x i8> [[ZN:%.*]], i64 0)
+// CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv64i8(<vscale x 64 x i8> [[ZN]], i64 16)
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv64i8(<vscale x 64 x i8> [[ZN]], i64 32)
+// CHECK-NEXT: [[TMP3:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv64i8(<vscale x 64 x i8> [[ZN]], i64 48)
+// CHECK-NEXT: tail call void @llvm.aarch64.sme.usmla.za32.single.vg4x4.nxv16i8(i32 [[ADD]], <vscale x 16 x i8> [[TMP0]], <vscale x 16 x i8> [[TMP1]], <vscale x 16 x i8> [[TMP2]], <vscale x 16 x i8> [[TMP3]], <vscale x 16 x i8> [[ZM:%.*]])
+// CHECK-NEXT: ret void
+//
+// CPP-CHECK-LABEL: @_Z25test_usmlall_single_x4_u8j11svuint8x4_tu10__SVInt8_t(
+// CPP-CHECK-NEXT: entry:
+// CPP-CHECK-NEXT: [[ADD:%.*]] = add i32 [[SLICE_BASE:%.*]], 4
+// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv64i8(<vscale x 64 x i8> [[ZN:%.*]], i64 0)
+// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv64i8(<vscale x 64 x i8> [[ZN]], i64 16)
+// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv64i8(<vscale x 64 x i8> [[ZN]], i64 32)
+// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv64i8(<vscale x 64 x i8> [[ZN]], i64 48)
+// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.usmla.za32.single.vg4x4.nxv16i8(i32 [[ADD]], <vscale x 16 x i8> [[TMP0]], <vscale x 16 x i8> [[TMP1]], <vscale x 16 x i8> [[TMP2]], <vscale x 16 x i8> [[TMP3]], <vscale x 16 x i8> [[ZM:%.*]])
+// CPP-CHECK-NEXT: ret void
+//
+void test_usmlall_single_x4_u8(uint32_t slice_base, svuint8x4_t zn, svint8_t zm) __arm_streaming __arm_shared_za
+{
+ SVE_ACLE_FUNC(svusmla,_single,_za32,_u8,_vg4x4)(slice_base + 4, zn, zm);
+}
+
+//
+// Multi x 2
+//
+
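+// Note: in the multi forms both zn and zm are tuples; the CHECK lines below
+// show each operand split into its parts, so the intrinsic pairs zn[i] with
+// zm[i] instead of broadcasting a single vector.
+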
+// MLALL
+
+// CHECK-LABEL: @test_mlal_multi_x2_s8(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[ADD:%.*]] = add i32 [[SLICE_BASE:%.*]], 4
+// CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv32i8(<vscale x 32 x i8> [[ZN:%.*]], i64 0)
+// CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv32i8(<vscale x 32 x i8> [[ZN]], i64 16)
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv32i8(<vscale x 32 x i8> [[ZM:%.*]], i64 0)
+// CHECK-NEXT: [[TMP3:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv32i8(<vscale x 32 x i8> [[ZM]], i64 16)
+// CHECK-NEXT: tail call void @llvm.aarch64.sme.smla.za32.vg4x2.nxv16i8(i32 [[ADD]], <vscale x 16 x i8> [[TMP0]], <vscale x 16 x i8> [[TMP1]], <vscale x 16 x i8> [[TMP2]], <vscale x 16 x i8> [[TMP3]])
+// CHECK-NEXT: ret void
+//
+// CPP-CHECK-LABEL: @_Z21test_mlal_multi_x2_s8j10svint8x2_tS_(
+// CPP-CHECK-NEXT: entry:
+// CPP-CHECK-NEXT: [[ADD:%.*]] = add i32 [[SLICE_BASE:%.*]], 4
+// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv32i8(<vscale x 32 x i8> [[ZN:%.*]], i64 0)
+// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv32i8(<vscale x 32 x i8> [[ZN]], i64 16)
+// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv32i8(<vscale x 32 x i8> [[ZM:%.*]], i64 0)
+// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv32i8(<vscale x 32 x i8> [[ZM]], i64 16)
+// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.smla.za32.vg4x2.nxv16i8(i32 [[ADD]], <vscale x 16 x i8> [[TMP0]], <vscale x 16 x i8> [[TMP1]], <vscale x 16 x i8> [[TMP2]], <vscale x 16 x i8> [[TMP3]])
+// CPP-CHECK-NEXT: ret void
+//
+void test_mlal_multi_x2_s8(uint32_t slice_base, svint8x2_t zn, svint8x2_t zm) __arm_streaming __arm_shared_za
+{
+ SVE_ACLE_FUNC(svmla_za32,_s8,_vg4x2,,)(slice_base + 4, zn, zm);
+}
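+
+// A minimal user-level sketch of the call above, using the explicit
+// (non-overloaded) spelling formed by the macro arguments; the "+ 4" in these
+// tests is the slice-index addition visible as [[ADD]] in the CHECK lines:
+//   void accumulate(uint32_t slice, svint8x2_t zn, svint8x2_t zm)
+//       __arm_streaming __arm_shared_za {
+//     svmla_za32_s8_vg4x2(slice + 4, zn, zm);
+//   }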
+
+// CHECK-LABEL: @test_mlal_multi_x2_s16(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[ADD:%.*]] = add i32 [[SLICE_BASE:%.*]], 4
+// CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv16i16(<vscale x 16 x i16> [[ZN:%.*]], i64 0)
+// CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv16i16(<vscale x 16 x i16> [[ZN]], i64 8)
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv16i16(<vscale x 16 x i16> [[ZM:%.*]], i64 0)
+// CHECK-NEXT: [[TMP3:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv16i16(<vscale x 16 x i16> [[ZM]], i64 8)
+// CHECK-NEXT: tail call void @llvm.aarch64.sme.smla.za64.vg4x2.nxv8i16(i32 [[ADD]], <vscale x 8 x i16> [[TMP0]], <vscale x 8 x i16> [[TMP1]], <vscale x 8 x i16> [[TMP2]], <vscale x 8 x i16> [[TMP3]])
+// CHECK-NEXT: ret void
+//
+// CPP-CHECK-LABEL: @_Z22test_mlal_multi_x2_s16j11svint16x2_tS_(
+// CPP-CHECK-NEXT: entry:
+// CPP-CHECK-NEXT: [[ADD:%.*]] = add i32 [[SLICE_BASE:%.*]], 4
+// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv16i16(<vscale x 16 x i16> [[ZN:%.*]], i64 0)
+// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv16i16(<vscale x 16 x i16> [[ZN]], i64 8)
+// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv16i16(<vscale x 16 x i16> [[ZM:%.*]], i64 0)
+// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv16i16(<vscale x 16 x i16> [[ZM]], i64 8)
+// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.smla.za64.vg4x2.nxv8i16(i32 [[ADD]], <vscale x 8 x i16> [[TMP0]], <vscale x 8 x i16> [[TMP1]], <vscale x 8 x i16> [[TMP2]], <vscale x 8 x i16> [[TMP3]])
+// CPP-CHECK-NEXT: ret void
+//
+void test_mlal_multi_x2_s16(uint32_t slice_base, svint16x2_t zn, svint16x2_t zm) __arm_streaming __arm_shared_za
+{
+ SVE_ACLE_FUNC(svmla_za64,_s16,_vg4x2,,)(slice_base + 4, zn, zm);
+}
+
+// CHECK-LABEL: @test_mlal_multi_x2_u8(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[ADD:%.*]] = add i32 [[SLICE_BASE:%.*]], 4
+// CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv32i8(<vscale x 32 x i8> [[ZN:%.*]], i64 0)
+// CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv32i8(<vscale x 32 x i8> [[ZN]], i64 16)
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv32i8(<vscale x 32 x i8> [[ZM:%.*]], i64 0)
+// CHECK-NEXT: [[TMP3:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv32i8(<vscale x 32 x i8> [[ZM]], i64 16)
+// CHECK-NEXT: tail call void @llvm.aarch64.sme.umla.za32.vg4x2.nxv16i8(i32 [[ADD]], <vscale x 16 x i8> [[TMP0]], <vscale x 16 x i8> [[TMP1]], <vscale x 16 x i8> [[TMP2]], <vscale x 16 x i8> [[TMP3]])
+// CHECK-NEXT: ret void
+//
+// CPP-CHECK-LABEL: @_Z21test_mlal_multi_x2_u8j11svuint8x2_tS_(
+// CPP-CHECK-NEXT: entry:
+// CPP-CHECK-NEXT: [[ADD:%.*]] = add i32 [[SLICE_BASE:%.*]], 4
+// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv32i8(<vscale x 32 x i8> [[ZN:%.*]], i64 0)
+// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv32i8(<vscale x 32 x i8> [[ZN]], i64 16)
+// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv32i8(<vscale x 32 x i8> [[ZM:%.*]], i64 0)
+// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv32i8(<vscale x 32 x i8> [[ZM]], i64 16)
+// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.umla.za32.vg4x2.nxv16i8(i32 [[ADD]], <vscale x 16 x i8> [[TMP0]], <vscale x 16 x i8> [[TMP1]], <vscale x 16 x i8> [[TMP2]], <vscale x 16 x i8> [[TMP3]])
+// CPP-CHECK-NEXT: ret void
+//
+void test_mlal_multi_x2_u8(uint32_t slice_base, svuint8x2_t zn, svuint8x2_t zm) __arm_streaming __arm_shared_za
+{
+ SVE_ACLE_FUNC(svmla_za32,_u8,_vg4x2,,)(slice_base + 4, zn, zm);
+}
+
+// CHECK-LABEL: @test_mlal_multi_x2_u16(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[ADD:%.*]] = add i32 [[SLICE_BASE:%.*]], 4
+// CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv16i16(<vscale x 16 x i16> [[ZN:%.*]], i64 0)
+// CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv16i16(<vscale x 16 x i16> [[ZN]], i64 8)
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv16i16(<vscale x 16 x i16> [[ZM:%.*]], i64 0)
+// CHECK-NEXT: [[TMP3:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv16i16(<vscale x 16 x i16> [[ZM]], i64 8)
+// CHECK-NEXT: tail call void @llvm.aarch64.sme.umla.za64.vg4x2.nxv8i16(i32 [[ADD]], <vscale x 8 x i16> [[TMP0]], <vscale x 8 x i16> [[TMP1]], <vscale x 8 x i16> [[TMP2]], <vscale x 8 x i16> [[TMP3]])
+// CHECK-NEXT: ret void
+//
+// CPP-CHECK-LABEL: @_Z22test_mlal_multi_x2_u16j12svuint16x2_tS_(
+// CPP-CHECK-NEXT: entry:
+// CPP-CHECK-NEXT: [[ADD:%.*]] = add i32 [[SLICE_BASE:%.*]], 4
+// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv16i16(<vscale x 16 x i16> [[ZN:%.*]], i64 0)
+// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv16i16(<vscale x 16 x i16> [[ZN]], i64 8)
+// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv16i16(<vscale x 16 x i16> [[ZM:%.*]], i64 0)
+// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv16i16(<vscale x 16 x i16> [[ZM]], i64 8)
+// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.umla.za64.vg4x2.nxv8i16(i32 [[ADD]], <vscale x 8 x i16> [[TMP0]], <vscale x 8 x i16> [[TMP1]], <vscale x 8 x i16> [[TMP2]], <vscale x 8 x i16> [[TMP3]])
+// CPP-CHECK-NEXT: ret void
+//
+void test_mlal_multi_x2_u16(uint32_t slice_base, svuint16x2_t zn, svuint16x2_t zm) __arm_streaming __arm_shared_za
+{
+ SVE_ACLE_FUNC(svmla_za64,_u16,_vg4x2,,)(slice_base + 4, zn, zm);
+}
+
+// MLSLL
+
+// CHECK-LABEL: @test_mlsl_multi_x2_s8(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[ADD:%.*]] = add i32 [[SLICE_BASE:%.*]], 4
+// CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv32i8(<vscale x 32 x i8> [[ZN:%.*]], i64 0)
+// CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv32i8(<vscale x 32 x i8> [[ZN]], i64 16)
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv32i8(<vscale x 32 x i8> [[ZM:%.*]], i64 0)
+// CHECK-NEXT: [[TMP3:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv32i8(<vscale x 32 x i8> [[ZM]], i64 16)
+// CHECK-NEXT: tail call void @llvm.aarch64.sme.smls.za32.vg4x2.nxv16i8(i32 [[ADD]], <vscale x 16 x i8> [[TMP0]], <vscale x 16 x i8> [[TMP1]], <vscale x 16 x i8> [[TMP2]], <vscale x 16 x i8> [[TMP3]])
+// CHECK-NEXT: ret void
+//
+// CPP-CHECK-LABEL: @_Z21test_mlsl_multi_x2_s8j10svint8x2_tS_(
+// CPP-CHECK-NEXT: entry:
+// CPP-CHECK-NEXT: [[ADD:%.*]] = add i32 [[SLICE_BASE:%.*]], 4
+// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv32i8(<vscale x 32 x i8> [[ZN:%.*]], i64 0)
+// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv32i8(<vscale x 32 x i8> [[ZN]], i64 16)
+// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv32i8(<vscale x 32 x i8> [[ZM:%.*]], i64 0)
+// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv32i8(<vscale x 32 x i8> [[ZM]], i64 16)
+// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.smls.za32.vg4x2.nxv16i8(i32 [[ADD]], <vscale x 16 x i8> [[TMP0]], <vscale x 16 x i8> [[TMP1]], <vscale x 16 x i8> [[TMP2]], <vscale x 16 x i8> [[TMP3]])
+// CPP-CHECK-NEXT: ret void
+//
+void test_mlsl_multi_x2_s8(uint32_t slice_base, svint8x2_t zn, svint8x2_t zm) __arm_streaming __arm_shared_za
+{
+ SVE_ACLE_FUNC(svmls_za32,_s8,_vg4x2,,)(slice_base + 4, zn, zm);
+}
+
+// CHECK-LABEL: @test_mlsl_multi_x2_s16(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[ADD:%.*]] = add i32 [[SLICE_BASE:%.*]], 4
+// CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv16i16(<vscale x 16 x i16> [[ZN:%.*]], i64 0)
+// CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv16i16(<vscale x 16 x i16> [[ZN]], i64 8)
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv16i16(<vscale x 16 x i16> [[ZM:%.*]], i64 0)
+// CHECK-NEXT: [[TMP3:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv16i16(<vscale x 16 x i16> [[ZM]], i64 8)
+// CHECK-NEXT: tail call void @llvm.aarch64.sme.smls.za64.vg4x2.nxv8i16(i32 [[ADD]], <vscale x 8 x i16> [[TMP0]], <vscale x 8 x i16> [[TMP1]], <vscale x 8 x i16> [[TMP2]], <vscale x 8 x i16> [[TMP3]])
+// CHECK-NEXT: ret void
+//
+// CPP-CHECK-LABEL: @_Z22test_mlsl_multi_x2_s16j11svint16x2_tS_(
+// CPP-CHECK-NEXT: entry:
+// CPP-CHECK-NEXT: [[ADD:%.*]] = add i32 [[SLICE_BASE:%.*]], 4
+// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv16i16(<vscale x 16 x i16> [[ZN:%.*]], i64 0)
+// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv16i16(<vscale x 16 x i16> [[ZN]], i64 8)
+// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv16i16(<vscale x 16 x i16> [[ZM:%.*]], i64 0)
+// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv16i16(<vscale x 16 x i16> [[ZM]], i64 8)
+// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.smls.za64.vg4x2.nxv8i16(i32 [[ADD]], <vscale x 8 x i16> [[TMP0]], <vscale x 8 x i16> [[TMP1]], <vscale x 8 x i16> [[TMP2]], <vscale x 8 x i16> [[TMP3]])
+// CPP-CHECK-NEXT: ret void
+//
+void test_mlsl_multi_x2_s16(uint32_t slice_base, svint16x2_t zn, svint16x2_t zm) __arm_streaming __arm_shared_za
+{
+ SVE_ACLE_FUNC(svmls_za64,_s16,_vg4x2,,)(slice_base + 4, zn, zm);
+}
+
+// CHECK-LABEL: @test_mlsl_multi_x2_u8(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[ADD:%.*]] = add i32 [[SLICE_BASE:%.*]], 4
+// CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv32i8(<vscale x 32 x i8> [[ZN:%.*]], i64 0)
+// CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv32i8(<vscale x 32 x i8> [[ZN]], i64 16)
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv32i8(<vscale x 32 x i8> [[ZM:%.*]], i64 0)
+// CHECK-NEXT: [[TMP3:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv32i8(<vscale x 32 x i8> [[ZM]], i64 16)
+// CHECK-NEXT: tail call void @llvm.aarch64.sme.umls.za32.vg4x2.nxv16i8(i32 [[ADD]], <vscale x 16 x i8> [[TMP0]], <vscale x 16 x i8> [[TMP1]], <vscale x 16 x i8> [[TMP2]], <vscale x 16 x i8> [[TMP3]])
+// CHECK-NEXT: ret void
+//
+// CPP-CHECK-LABEL: @_Z21test_mlsl_multi_x2_u8j11svuint8x2_tS_(
+// CPP-CHECK-NEXT: entry:
+// CPP-CHECK-NEXT: [[ADD:%.*]] = add i32 [[SLICE_BASE:%.*]], 4
+// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv32i8(<vscale x 32 x i8> [[ZN:%.*]], i64 0)
+// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv32i8(<vscale x 32 x i8> [[ZN]], i64 16)
+// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv32i8(<vscale x 32 x i8> [[ZM:%.*]], i64 0)
+// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv32i8(<vscale x 32 x i8> [[ZM]], i64 16)
+// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.umls.za32.vg4x2.nxv16i8(i32 [[ADD]], <vscale x 16 x i8> [[TMP0]], <vscale x 16 x i8> [[TMP1]], <vscale x 16 x i8> [[TMP2]], <vscale x 16 x i8> [[TMP3]])
+// CPP-CHECK-NEXT: ret void
+//
+void test_mlsl_multi_x2_u8(uint32_t slice_base, svuint8x2_t zn, svuint8x2_t zm) __arm_streaming __arm_shared_za
+{
+ SVE_ACLE_FUNC(svmls_za32,_u8,_vg4x2,,)(slice_base + 4, zn, zm);
+}
+
+// CHECK-LABEL: @test_mlsl_multi_x2_u16(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[ADD:%.*]] = add i32 [[SLICE_BASE:%.*]], 4
+// CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv16i16(<vscale x 16 x i16> [[ZN:%.*]], i64 0)
+// CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv16i16(<vscale x 16 x i16> [[ZN]], i64 8)
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv16i16(<vscale x 16 x i16> [[ZM:%.*]], i64 0)
+// CHECK-NEXT: [[TMP3:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv16i16(<vscale x 16 x i16> [[ZM]], i64 8)
+// CHECK-NEXT: tail call void @llvm.aarch64.sme.umls.za64.vg4x2.nxv8i16(i32 [[ADD]], <vscale x 8 x i16> [[TMP0]], <vscale x 8 x i16> [[TMP1]], <vscale x 8 x i16> [[TMP2]], <vscale x 8 x i16> [[TMP3]])
+// CHECK-NEXT: ret void
+//
+// CPP-CHECK-LABEL: @_Z22test_mlsl_multi_x2_u16j12svuint16x2_tS_(
+// CPP-CHECK-NEXT: entry:
+// CPP-CHECK-NEXT: [[ADD:%.*]] = add i32 [[SLICE_BASE:%.*]], 4
+// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv16i16(<vscale x 16 x i16> [[ZN:%.*]], i64 0)
+// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv16i16(<vscale x 16 x i16> [[ZN]], i64 8)
+// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv16i16(<vscale x 16 x i16> [[ZM:%.*]], i64 0)
+// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv16i16(<vscale x 16 x i16> [[ZM]], i64 8)
+// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.umls.za64.vg4x2.nxv8i16(i32 [[ADD]], <vscale x 8 x i16> [[TMP0]], <vscale x 8 x i16> [[TMP1]], <vscale x 8 x i16> [[TMP2]], <vscale x 8 x i16> [[TMP3]])
+// CPP-CHECK-NEXT: ret void
+//
+void test_mlsl_multi_x2_u16(uint32_t slice_base, svuint16x2_t zn, svuint16x2_t zm) __arm_streaming __arm_shared_za
+{
+ SVE_ACLE_FUNC(svmls_za64,_u16,_vg4x2,,)(slice_base + 4, zn, zm);
+}
+
+// SUMLALL
+
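+// Note: no dedicated multi-vector sumla intrinsic is used here; the CHECK
+// lines below show svsumla_za32 emitted as @llvm.aarch64.sme.usmla.za32.vg4x2
+// with the zn and zm parts swapped, which is sound because the multiply is
+// commutative and only the operand signedness differs.
+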
+// CHECK-LABEL: @test_sumlal_multi_x2_s8(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[ADD:%.*]] = add i32 [[SLICE_BASE:%.*]], 4
+// CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv32i8(<vscale x 32 x i8> [[ZN:%.*]], i64 0)
+// CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv32i8(<vscale x 32 x i8> [[ZN]], i64 16)
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv32i8(<vscale x 32 x i8> [[ZM:%.*]], i64 0)
+// CHECK-NEXT: [[TMP3:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv32i8(<vscale x 32 x i8> [[ZM]], i64 16)
+// CHECK-NEXT: tail call void @llvm.aarch64.sme.usmla.za32.vg4x2.nxv16i8(i32 [[ADD]], <vscale x 16 x i8> [[TMP2]], <vscale x 16 x i8> [[TMP3]], <vscale x 16 x i8> [[TMP0]], <vscale x 16 x i8> [[TMP1]])
+// CHECK-NEXT: ret void
+//
+// CPP-CHECK-LABEL: @_Z23test_sumlal_multi_x2_s8j10svint8x2_t11svuint8x2_t(
+// CPP-CHECK-NEXT: entry:
+// CPP-CHECK-NEXT: [[ADD:%.*]] = add i32 [[SLICE_BASE:%.*]], 4
+// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv32i8(<vscale x 32 x i8> [[ZN:%.*]], i64 0)
+// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv32i8(<vscale x 32 x i8> [[ZN]], i64 16)
+// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv32i8(<vscale x 32 x i8> [[ZM:%.*]], i64 0)
+// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv32i8(<vscale x 32 x i8> [[ZM]], i64 16)
+// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.usmla.za32.vg4x2.nxv16i8(i32 [[ADD]], <vscale x 16 x i8> [[TMP2]], <vscale x 16 x i8> [[TMP3]], <vscale x 16 x i8> [[TMP0]], <vscale x 16 x i8> [[TMP1]])
+// CPP-CHECK-NEXT: ret void
+//
+void test_sumlal_multi_x2_s8(uint32_t slice_base, svint8x2_t zn, svuint8x2_t zm) __arm_streaming __arm_shared_za
+{
+ SVE_ACLE_FUNC(svsumla_za32,_s8,_vg4x2,,)(slice_base + 4, zn, zm);
+}
+
+// USMLALL
+
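+// Note: unlike the sumla case above, the usmla multi form keeps the natural
+// operand order, with the zn parts passed first.
+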
+// CHECK-LABEL: @test_usmlal_multi_x2_u8(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[ADD:%.*]] = add i32 [[SLICE_BASE:%.*]], 4
+// CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv32i8(<vscale x 32 x i8> [[ZN:%.*]], i64 0)
+// CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv32i8(<vscale x 32 x i8> [[ZN]], i64 16)
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv32i8(<vscale x 32 x i8> [[ZM:%.*]], i64 0)
+// CHECK-NEXT: [[TMP3:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv32i8(<vscale x 32 x i8> [[ZM]], i64 16)
+// CHECK-NEXT: tail call void @llvm.aarch64.sme.usmla.za32.vg4x2.nxv16i8(i32 [[ADD]], <vscale x 16 x i8> [[TMP0]], <vscale x 16 x i8> [[TMP1]], <vscale x 16 x i8> [[TMP2]], <vscale x 16 x i8> [[TMP3]])
+// CHECK-NEXT: ret void
+//
+// CPP-CHECK-LABEL: @_Z23test_usmlal_multi_x2_u8j11svuint8x2_t10svint8x2_t(
+// CPP-CHECK-NEXT: entry:
+// CPP-CHECK-NEXT: [[ADD:%.*]] = add i32 [[SLICE_BASE:%.*]], 4
+// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv32i8(<vscale x 32 x i8> [[ZN:%.*]], i64 0)
+// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv32i8(<vscale x 32 x i8> [[ZN]], i64 16)
+// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv32i8(<vscale x 32 x i8> [[ZM:%.*]], i64 0)
+// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv32i8(<vscale x 32 x i8> [[ZM]], i64 16)
+// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.usmla.za32.vg4x2.nxv16i8(i32 [[ADD]], <vscale x 16 x i8> [[TMP0]], <vscale x 16 x i8> [[TMP1]], <vscale x 16 x i8> [[TMP2]], <vscale x 16 x i8> [[TMP3]])
+// CPP-CHECK-NEXT: ret void
+//
+void test_usmlal_multi_x2_u8(uint32_t slice_base, svuint8x2_t zn, svint8x2_t zm) __arm_streaming __arm_shared_za
+{
+ SVE_ACLE_FUNC(svusmla_za32,_u8,_vg4x2,,)(slice_base + 4, zn, zm);
+}
+
+//
+// Multi x 4
+//
+
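+// Note: the x4 multi forms split both four-vector tuples, passing eight
+// scalable vectors plus the slice index to the vg4x4 intrinsics.
+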
+// MLALL
+
+// CHECK-LABEL: @test_mlal_multi_x4_s8(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[ADD:%.*]] = add i32 [[SLICE_BASE:%.*]], 4
+// CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv64i8(<vscale x 64 x i8> [[ZN:%.*]], i64 0)
+// CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv64i8(<vscale x 64 x i8> [[ZN]], i64 16)
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv64i8(<vscale x 64 x i8> [[ZN]], i64 32)
+// CHECK-NEXT: [[TMP3:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv64i8(<vscale x 64 x i8> [[ZN]], i64 48)
+// CHECK-NEXT: [[TMP4:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv64i8(<vscale x 64 x i8> [[ZM:%.*]], i64 0)
+// CHECK-NEXT: [[TMP5:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv64i8(<vscale x 64 x i8> [[ZM]], i64 16)
+// CHECK-NEXT: [[TMP6:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv64i8(<vscale x 64 x i8> [[ZM]], i64 32)
+// CHECK-NEXT: [[TMP7:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv64i8(<vscale x 64 x i8> [[ZM]], i64 48)
+// CHECK-NEXT: tail call void @llvm.aarch64.sme.smla.za32.vg4x4.nxv16i8(i32 [[ADD]], <vscale x 16 x i8> [[TMP0]], <vscale x 16 x i8> [[TMP1]], <vscale x 16 x i8> [[TMP2]], <vscale x 16 x i8> [[TMP3]], <vscale x 16 x i8> [[TMP4]], <vscale x 16 x i8> [[TMP5]], <vscale x 16 x i8> [[TMP6]], <vscale x 16 x i8> [[TMP7]])
+// CHECK-NEXT: ret void
+//
+// CPP-CHECK-LABEL: @_Z21test_mlal_multi_x4_s8j10svint8x4_tS_(
+// CPP-CHECK-NEXT: entry:
+// CPP-CHECK-NEXT: [[ADD:%.*]] = add i32 [[SLICE_BASE:%.*]], 4
+// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv64i8(<vscale x 64 x i8> [[ZN:%.*]], i64 0)
+// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv64i8(<vscale x 64 x i8> [[ZN]], i64 16)
+// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv64i8(<vscale x 64 x i8> [[ZN]], i64 32)
+// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv64i8(<vscale x 64 x i8> [[ZN]], i64 48)
+// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv64i8(<vscale x 64 x i8> [[ZM:%.*]], i64 0)
+// CPP-CHECK-NEXT: [[TMP5:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv64i8(<vscale x 64 x i8> [[ZM]], i64 16)
+// CPP-CHECK-NEXT: [[TMP6:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv64i8(<vscale x 64 x i8> [[ZM]], i64 32)
+// CPP-CHECK-NEXT: [[TMP7:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv64i8(<vscale x 64 x i8> [[ZM]], i64 48)
+// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.smla.za32.vg4x4.nxv16i8(i32 [[ADD]], <vscale x 16 x i8> [[TMP0]], <vscale x 16 x i8> [[TMP1]], <vscale x 16 x i8> [[TMP2]], <vscale x 16 x i8> [[TMP3]], <vscale x 16 x i8> [[TMP4]], <vscale x 16 x i8> [[TMP5]], <vscale x 16 x i8> [[TMP6]], <vscale x 16 x i8> [[TMP7]])
+// CPP-CHECK-NEXT: ret void
+//
+void test_mlal_multi_x4_s8(uint32_t slice_base, svint8x4_t zn, svint8x4_t zm) __arm_streaming __arm_shared_za
+{
+ SVE_ACLE_FUNC(svmla_za32,_s8,_vg4x4,,)(slice_base + 4, zn, zm);
+}
+
+// CHECK-LABEL: @test_mlal_multi_x4_s16(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[ADD:%.*]] = add i32 [[SLICE_BASE:%.*]], 4
+// CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv32i16(<vscale x 32 x i16> [[ZN:%.*]], i64 0)
+// CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv32i16(<vscale x 32 x i16> [[ZN]], i64 8)
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv32i16(<vscale x 32 x i16> [[ZN]], i64 16)
+// CHECK-NEXT: [[TMP3:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv32i16(<vscale x 32 x i16> [[ZN]], i64 24)
+// CHECK-NEXT: [[TMP4:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv32i16(<vscale x 32 x i16> [[ZM:%.*]], i64 0)
+// CHECK-NEXT: [[TMP5:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv32i16(<vscale x 32 x i16> [[ZM]], i64 8)
+// CHECK-NEXT: [[TMP6:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv32i16(<vscale x 32 x i16> [[ZM]], i64 16)
+// CHECK-NEXT: [[TMP7:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv32i16(<vscale x 32 x i16> [[ZM]], i64 24)
+// CHECK-NEXT: tail call void @llvm.aarch64.sme.smla.za64.vg4x4.nxv8i16(i32 [[ADD]], <vscale x 8 x i16> [[TMP0]], <vscale x 8 x i16> [[TMP1]], <vscale x 8 x i16> [[TMP2]], <vscale x 8 x i16> [[TMP3]], <vscale x 8 x i16> [[TMP4]], <vscale x 8 x i16> [[TMP5]], <vscale x 8 x i16> [[TMP6]], <vscale x 8 x i16> [[TMP7]])
+// CHECK-NEXT: ret void
+//
+// CPP-CHECK-LABEL: @_Z22test_mlal_multi_x4_s16j11svint16x4_tS_(
+// CPP-CHECK-NEXT: entry:
+// CPP-CHECK-NEXT: [[ADD:%.*]] = add i32 [[SLICE_BASE:%.*]], 4
+// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv32i16(<vscale x 32 x i16> [[ZN:%.*]], i64 0)
+// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv32i16(<vscale x 32 x i16> [[ZN]], i64 8)
+// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv32i16(<vscale x 32 x i16> [[ZN]], i64 16)
+// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv32i16(<vscale x 32 x i16> [[ZN]], i64 24)
+// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv32i16(<vscale x 32 x i16> [[ZM:%.*]], i64 0)
+// CPP-CHECK-NEXT: [[TMP5:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv32i16(<vscale x 32 x i16> [[ZM]], i64 8)
+// CPP-CHECK-NEXT: [[TMP6:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv32i16(<vscale x 32 x i16> [[ZM]], i64 16)
+// CPP-CHECK-NEXT: [[TMP7:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv32i16(<vscale x 32 x i16> [[ZM]], i64 24)
+// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.smla.za64.vg4x4.nxv8i16(i32 [[ADD]], <vscale x 8 x i16> [[TMP0]], <vscale x 8 x i16> [[TMP1]], <vscale x 8 x i16> [[TMP2]], <vscale x 8 x i16> [[TMP3]], <vscale x 8 x i16> [[TMP4]], <vscale x 8 x i16> [[TMP5]], <vscale x 8 x i16> [[TMP6]], <vscale x 8 x i16> [[TMP7]])
+// CPP-CHECK-NEXT: ret void
+//
+void test_mlal_multi_x4_s16(uint32_t slice_base, svint16x4_t zn, svint16x4_t zm) __arm_streaming __arm_shared_za
+{
+ SVE_ACLE_FUNC(svmla_za64,_s16,_vg4x4,,)(slice_base + 4, zn, zm);
+}
+
+// CHECK-LABEL: @test_mlal_multi_x4_u8(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[ADD:%.*]] = add i32 [[SLICE_BASE:%.*]], 4
+// CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv64i8(<vscale x 64 x i8> [[ZN:%.*]], i64 0)
+// CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv64i8(<vscale x 64 x i8> [[ZN]], i64 16)
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv64i8(<vscale x 64 x i8> [[ZN]], i64 32)
+// CHECK-NEXT: [[TMP3:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv64i8(<vscale x 64 x i8> [[ZN]], i64 48)
+// CHECK-NEXT: [[TMP4:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv64i8(<vscale x 64 x i8> [[ZM:%.*]], i64 0)
+// CHECK-NEXT: [[TMP5:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv64i8(<vscale x 64 x i8> [[ZM]], i64 16)
+// CHECK-NEXT: [[TMP6:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv64i8(<vscale x 64 x i8> [[ZM]], i64 32)
+// CHECK-NEXT: [[TMP7:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv64i8(<vscale x 64 x i8> [[ZM]], i64 48)
+// CHECK-NEXT: tail call void @llvm.aarch64.sme.umla.za32.vg4x4.nxv16i8(i32 [[ADD]], <vscale x 16 x i8> [[TMP0]], <vscale x 16 x i8> [[TMP1]], <vscale x 16 x i8> [[TMP2]], <vscale x 16 x i8> [[TMP3]], <vscale x 16 x i8> [[TMP4]], <vscale x 16 x i8> [[TMP5]], <vscale x 16 x i8> [[TMP6]], <vscale x 16 x i8> [[TMP7]])
+// CHECK-NEXT: ret void
+//
+// CPP-CHECK-LABEL: @_Z21test_mlal_multi_x4_u8j11svuint8x4_tS_(
+// CPP-CHECK-NEXT: entry:
+// CPP-CHECK-NEXT: [[ADD:%.*]] = add i32 [[SLICE_BASE:%.*]], 4
+// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv64i8(<vscale x 64 x i8> [[ZN:%.*]], i64 0)
+// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv64i8(<vscale x 64 x i8> [[ZN]], i64 16)
+// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv64i8(<vscale x 64 x i8> [[ZN]], i64 32)
+// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv64i8(<vscale x 64 x i8> [[ZN]], i64 48)
+// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv64i8(<vscale x 64 x i8> [[ZM:%.*]], i64 0)
+// CPP-CHECK-NEXT: [[TMP5:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv64i8(<vscale x 64 x i8> [[ZM]], i64 16)
+// CPP-CHECK-NEXT: [[TMP6:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv64i8(<vscale x 64 x i8> [[ZM]], i64 32)
+// CPP-CHECK-NEXT: [[TMP7:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv64i8(<vscale x 64 x i8> [[ZM]], i64 48)
+// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.umla.za32.vg4x4.nxv16i8(i32 [[ADD]], <vscale x 16 x i8> [[TMP0]], <vscale x 16 x i8> [[TMP1]], <vscale x 16 x i8> [[TMP2]], <vscale x 16 x i8> [[TMP3]], <vscale x 16 x i8> [[TMP4]], <vscale x 16 x i8> [[TMP5]], <vscale x 16 x i8> [[TMP6]], <vscale x 16 x i8> [[TMP7]])
+// CPP-CHECK-NEXT: ret void
+//
+void test_mlal_multi_x4_u8(uint32_t slice_base, svuint8x4_t zn, svuint8x4_t zm) __arm_streaming __arm_shared_za
+{
+ SVE_ACLE_FUNC(svmla_za32,_u8,_vg4x4,,)(slice_base + 4, zn, zm);
+}
+
+// CHECK-LABEL: @test_mlal_multi_x4_u16(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[ADD:%.*]] = add i32 [[SLICE_BASE:%.*]], 4
+// CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv32i16(<vscale x 32 x i16> [[ZN:%.*]], i64 0)
+// CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv32i16(<vscale x 32 x i16> [[ZN]], i64 8)
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv32i16(<vscale x 32 x i16> [[ZN]], i64 16)
+// CHECK-NEXT: [[TMP3:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv32i16(<vscale x 32 x i16> [[ZN]], i64 24)
+// CHECK-NEXT: [[TMP4:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv32i16(<vscale x 32 x i16> [[ZM:%.*]], i64 0)
+// CHECK-NEXT: [[TMP5:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv32i16(<vscale x 32 x i16> [[ZM]], i64 8)
+// CHECK-NEXT: [[TMP6:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv32i16(<vscale x 32 x i16> [[ZM]], i64 16)
+// CHECK-NEXT: [[TMP7:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv32i16(<vscale x 32 x i16> [[ZM]], i64 24)
+// CHECK-NEXT: tail call void @llvm.aarch64.sme.umla.za64.vg4x4.nxv8i16(i32 [[ADD]], <vscale x 8 x i16> [[TMP0]], <vscale x 8 x i16> [[TMP1]], <vscale x 8 x i16> [[TMP2]], <vscale x 8 x i16> [[TMP3]], <vscale x 8 x i16> [[TMP4]], <vscale x 8 x i16> [[TMP5]], <vscale x 8 x i16> [[TMP6]], <vscale x 8 x i16> [[TMP7]])
+// CHECK-NEXT: ret void
+//
+// CPP-CHECK-LABEL: @_Z22test_mlal_multi_x4_u16j12svuint16x4_tS_(
+// CPP-CHECK-NEXT: entry:
+// CPP-CHECK-NEXT: [[ADD:%.*]] = add i32 [[SLICE_BASE:%.*]], 4
+// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv32i16(<vscale x 32 x i16> [[ZN:%.*]], i64 0)
+// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv32i16(<vscale x 32 x i16> [[ZN]], i64 8)
+// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv32i16(<vscale x 32 x i16> [[ZN]], i64 16)
+// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv32i16(<vscale x 32 x i16> [[ZN]], i64 24)
+// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv32i16(<vscale x 32 x i16> [[ZM:%.*]], i64 0)
+// CPP-CHECK-NEXT: [[TMP5:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv32i16(<vscale x 32 x i16> [[ZM]], i64 8)
+// CPP-CHECK-NEXT: [[TMP6:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv32i16(<vscale x 32 x i16> [[ZM]], i64 16)
+// CPP-CHECK-NEXT: [[TMP7:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv32i16(<vscale x 32 x i16> [[ZM]], i64 24)
+// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.umla.za64.vg4x4.nxv8i16(i32 [[ADD]], <vscale x 8 x i16> [[TMP0]], <vscale x 8 x i16> [[TMP1]], <vscale x 8 x i16> [[TMP2]], <vscale x 8 x i16> [[TMP3]], <vscale x 8 x i16> [[TMP4]], <vscale x 8 x i16> [[TMP5]], <vscale x 8 x i16> [[TMP6]], <vscale x 8 x i16> [[TMP7]])
+// CPP-CHECK-NEXT: ret void
+//
+void test_mlal_multi_x4_u16(uint32_t slice_base, svuint16x4_t zn, svuint16x4_t zm) __arm_streaming __arm_shared_za
+{
+ SVE_ACLE_FUNC(svmla_za64,_u16,_vg4x4,,)(slice_base + 4, zn, zm);
+}
+
+// MLSLL
+
+// CHECK-LABEL: @test_mlsl_multi_x4_s8(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[ADD:%.*]] = add i32 [[SLICE_BASE:%.*]], 4
+// CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv64i8(<vscale x 64 x i8> [[ZN:%.*]], i64 0)
+// CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv64i8(<vscale x 64 x i8> [[ZN]], i64 16)
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv64i8(<vscale x 64 x i8> [[ZN]], i64 32)
+// CHECK-NEXT: [[TMP3:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv64i8(<vscale x 64 x i8> [[ZN]], i64 48)
+// CHECK-NEXT: [[TMP4:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv64i8(<vscale x 64 x i8> [[ZM:%.*]], i64 0)
+// CHECK-NEXT: [[TMP5:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv64i8(<vscale x 64 x i8> [[ZM]], i64 16)
+// CHECK-NEXT: [[TMP6:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv64i8(<vscale x 64 x i8> [[ZM]], i64 32)
+// CHECK-NEXT: [[TMP7:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv64i8(<vscale x 64 x i8> [[ZM]], i64 48)
+// CHECK-NEXT: tail call void @llvm.aarch64.sme.smls.za32.vg4x4.nxv16i8(i32 [[ADD]], <vscale x 16 x i8> [[TMP0]], <vscale x 16 x i8> [[TMP1]], <vscale x 16 x i8> [[TMP2]], <vscale x 16 x i8> [[TMP3]], <vscale x 16 x i8> [[TMP4]], <vscale x 16 x i8> [[TMP5]], <vscale x 16 x i8> [[TMP6]], <vscale x 16 x i8> [[TMP7]])
+// CHECK-NEXT: ret void
+//
+// CPP-CHECK-LABEL: @_Z21test_mlsl_multi_x4_s8j10svint8x4_tS_(
+// CPP-CHECK-NEXT: entry:
+// CPP-CHECK-NEXT: [[ADD:%.*]] = add i32 [[SLICE_BASE:%.*]], 4
+// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv64i8(<vscale x 64 x i8> [[ZN:%.*]], i64 0)
+// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv64i8(<vscale x 64 x i8> [[ZN]], i64 16)
+// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv64i8(<vscale x 64 x i8> [[ZN]], i64 32)
+// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv64i8(<vscale x 64 x i8> [[ZN]], i64 48)
+// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv64i8(<vscale x 64 x i8> [[ZM:%.*]], i64 0)
+// CPP-CHECK-NEXT: [[TMP5:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv64i8(<vscale x 64 x i8> [[ZM]], i64 16)
+// CPP-CHECK-NEXT: [[TMP6:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv64i8(<vscale x 64 x i8> [[ZM]], i64 32)
+// CPP-CHECK-NEXT: [[TMP7:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv64i8(<vscale x 64 x i8> [[ZM]], i64 48)
+// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.smls.za32.vg4x4.nxv16i8(i32 [[ADD]], <vscale x 16 x i8> [[TMP0]], <vscale x 16 x i8> [[TMP1]], <vscale x 16 x i8> [[TMP2]], <vscale x 16 x i8> [[TMP3]], <vscale x 16 x i8> [[TMP4]], <vscale x 16 x i8> [[TMP5]], <vscale x 16 x i8> [[TMP6]], <vscale x 16 x i8> [[TMP7]])
+// CPP-CHECK-NEXT: ret void
+//
+void test_mlsl_multi_x4_s8(uint32_t slice_base, svint8x4_t zn, svint8x4_t zm) __arm_streaming __arm_shared_za
+{
+ SVE_ACLE_FUNC(svmls_za32,_s8,_vg4x4,,)(slice_base + 4, zn, zm);
+}
+
+// CHECK-LABEL: @test_mlsl_multi_x4_s16(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[ADD:%.*]] = add i32 [[SLICE_BASE:%.*]], 4
+// CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv32i16(<vscale x 32 x i16> [[ZN:%.*]], i64 0)
+// CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv32i16(<vscale x 32 x i16> [[ZN]], i64 8)
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv32i16(<vscale x 32 x i16> [[ZN]], i64 16)
+// CHECK-NEXT: [[TMP3:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv32i16(<vscale x 32 x i16> [[ZN]], i64 24)
+// CHECK-NEXT: [[TMP4:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv32i16(<vscale x 32 x i16> [[ZM:%.*]], i64 0)
+// CHECK-NEXT: [[TMP5:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv32i16(<vscale x 32 x i16> [[ZM]], i64 8)
+// CHECK-NEXT: [[TMP6:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv32i16(<vscale x 32 x i16> [[ZM]], i64 16)
+// CHECK-NEXT: [[TMP7:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv32i16(<vscale x 32 x i16> [[ZM]], i64 24)
+// CHECK-NEXT: tail call void @llvm.aarch64.sme.smls.za64.vg4x4.nxv8i16(i32 [[ADD]], <vscale x 8 x i16> [[TMP0]], <vscale x 8 x i16> [[TMP1]], <vscale x 8 x i16> [[TMP2]], <vscale x 8 x i16> [[TMP3]], <vscale x 8 x i16> [[TMP4]], <vscale x 8 x i16> [[TMP5]], <vscale x 8 x i16> [[TMP6]], <vscale x 8 x i16> [[TMP7]])
+// CHECK-NEXT: ret void
+//
+// CPP-CHECK-LABEL: @_Z22test_mlsl_multi_x4_s16j11svint16x4_tS_(
+// CPP-CHECK-NEXT: entry:
+// CPP-CHECK-NEXT: [[ADD:%.*]] = add i32 [[SLICE_BASE:%.*]], 4
+// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv32i16(<vscale x 32 x i16> [[ZN:%.*]], i64 0)
+// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv32i16(<vscale x 32 x i16> [[ZN]], i64 8)
+// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv32i16(<vscale x 32 x i16> [[ZN]], i64 16)
+// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv32i16(<vscale x 32 x i16> [[ZN]], i64 24)
+// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv32i16(<vscale x 32 x i16> [[ZM:%.*]], i64 0)
+// CPP-CHECK-NEXT: [[TMP5:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv32i16(<vscale x 32 x i16> [[ZM]], i64 8)
+// CPP-CHECK-NEXT: [[TMP6:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv32i16(<vscale x 32 x i16> [[ZM]], i64 16)
+// CPP-CHECK-NEXT: [[TMP7:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv32i16(<vscale x 32 x i16> [[ZM]], i64 24)
+// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.smls.za64.vg4x4.nxv8i16(i32 [[ADD]], <vscale x 8 x i16> [[TMP0]], <vscale x 8 x i16> [[TMP1]], <vscale x 8 x i16> [[TMP2]], <vscale x 8 x i16> [[TMP3]], <vscale x 8 x i16> [[TMP4]], <vscale x 8 x i16> [[TMP5]], <vscale x 8 x i16> [[TMP6]], <vscale x 8 x i16> [[TMP7]])
+// CPP-CHECK-NEXT: ret void
+//
+void test_mlsl_multi_x4_s16(uint32_t slice_base, svint16x4_t zn, svint16x4_t zm) __arm_streaming __arm_shared_za
+{
+ SVE_ACLE_FUNC(svmls_za64,_s16,_vg4x4,,)(slice_base + 4, zn, zm);
+}
+
+// CHECK-LABEL: @test_mlsl_multi_x4_u8(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[ADD:%.*]] = add i32 [[SLICE_BASE:%.*]], 4
+// CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv64i8(<vscale x 64 x i8> [[ZN:%.*]], i64 0)
+// CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv64i8(<vscale x 64 x i8> [[ZN]], i64 16)
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv64i8(<vscale x 64 x i8> [[ZN]], i64 32)
+// CHECK-NEXT: [[TMP3:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv64i8(<vscale x 64 x i8> [[ZN]], i64 48)
+// CHECK-NEXT: [[TMP4:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv64i8(<vscale x 64 x i8> [[ZM:%.*]], i64 0)
+// CHECK-NEXT: [[TMP5:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv64i8(<vscale x 64 x i8> [[ZM]], i64 16)
+// CHECK-NEXT: [[TMP6:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv64i8(<vscale x 64 x i8> [[ZM]], i64 32)
+// CHECK-NEXT: [[TMP7:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv64i8(<vscale x 64 x i8> [[ZM]], i64 48)
+// CHECK-NEXT: tail call void @llvm.aarch64.sme.umls.za32.vg4x4.nxv16i8(i32 [[ADD]], <vscale x 16 x i8> [[TMP0]], <vscale x 16 x i8> [[TMP1]], <vscale x 16 x i8> [[TMP2]], <vscale x 16 x i8> [[TMP3]], <vscale x 16 x i8> [[TMP4]], <vscale x 16 x i8> [[TMP5]], <vscale x 16 x i8> [[TMP6]], <vscale x 16 x i8> [[TMP7]])
+// CHECK-NEXT: ret void
+//
+// CPP-CHECK-LABEL: @_Z21test_mlsl_multi_x4_u8j11svuint8x4_tS_(
+// CPP-CHECK-NEXT: entry:
+// CPP-CHECK-NEXT: [[ADD:%.*]] = add i32 [[SLICE_BASE:%.*]], 4
+// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv64i8(<vscale x 64 x i8> [[ZN:%.*]], i64 0)
+// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv64i8(<vscale x 64 x i8> [[ZN]], i64 16)
+// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv64i8(<vscale x 64 x i8> [[ZN]], i64 32)
+// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv64i8(<vscale x 64 x i8> [[ZN]], i64 48)
+// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv64i8(<vscale x 64 x i8> [[ZM:%.*]], i64 0)
+// CPP-CHECK-NEXT: [[TMP5:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv64i8(<vscale x 64 x i8> [[ZM]], i64 16)
+// CPP-CHECK-NEXT: [[TMP6:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv64i8(<vscale x 64 x i8> [[ZM]], i64 32)
+// CPP-CHECK-NEXT: [[TMP7:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv64i8(<vscale x 64 x i8> [[ZM]], i64 48)
+// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.umls.za32.vg4x4.nxv16i8(i32 [[ADD]], <vscale x 16 x i8> [[TMP0]], <vscale x 16 x i8> [[TMP1]], <vscale x 16 x i8> [[TMP2]], <vscale x 16 x i8> [[TMP3]], <vscale x 16 x i8> [[TMP4]], <vscale x 16 x i8> [[TMP5]], <vscale x 16 x i8> [[TMP6]], <vscale x 16 x i8> [[TMP7]])
+// CPP-CHECK-NEXT: ret void
+//
+void test_mlsl_multi_x4_u8(uint32_t slice_base, svuint8x4_t zn, svuint8x4_t zm) __arm_streaming __arm_shared_za
+{
+ SVE_ACLE_FUNC(svmls_za32,_u8,_vg4x4,,)(slice_base + 4, zn, zm);
+}
+
+// CHECK-LABEL: @test_mlsl_multi_x4_u16(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[ADD:%.*]] = add i32 [[SLICE_BASE:%.*]], 4
+// CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv32i16(<vscale x 32 x i16> [[ZN:%.*]], i64 0)
+// CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv32i16(<vscale x 32 x i16> [[ZN]], i64 8)
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv32i16(<vscale x 32 x i16> [[ZN]], i64 16)
+// CHECK-NEXT: [[TMP3:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv32i16(<vscale x 32 x i16> [[ZN]], i64 24)
+// CHECK-NEXT: [[TMP4:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv32i16(<vscale x 32 x i16> [[ZM:%.*]], i64 0)
+// CHECK-NEXT: [[TMP5:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv32i16(<vscale x 32 x i16> [[ZM]], i64 8)
+// CHECK-NEXT: [[TMP6:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv32i16(<vscale x 32 x i16> [[ZM]], i64 16)
+// CHECK-NEXT: [[TMP7:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv32i16(<vscale x 32 x i16> [[ZM]], i64 24)
+// CHECK-NEXT: tail call void @llvm.aarch64.sme.umls.za64.vg4x4.nxv8i16(i32 [[ADD]], <vscale x 8 x i16> [[TMP0]], <vscale x 8 x i16> [[TMP1]], <vscale x 8 x i16> [[TMP2]], <vscale x 8 x i16> [[TMP3]], <vscale x 8 x i16> [[TMP4]], <vscale x 8 x i16> [[TMP5]], <vscale x 8 x i16> [[TMP6]], <vscale x 8 x i16> [[TMP7]])
+// CHECK-NEXT: ret void
+//
+// CPP-CHECK-LABEL: @_Z22test_mlsl_multi_x4_u16j12svuint16x4_tS_(
+// CPP-CHECK-NEXT: entry:
+// CPP-CHECK-NEXT: [[ADD:%.*]] = add i32 [[SLICE_BASE:%.*]], 4
+// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv32i16(<vscale x 32 x i16> [[ZN:%.*]], i64 0)
+// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv32i16(<vscale x 32 x i16> [[ZN]], i64 8)
+// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv32i16(<vscale x 32 x i16> [[ZN]], i64 16)
+// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv32i16(<vscale x 32 x i16> [[ZN]], i64 24)
+// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv32i16(<vscale x 32 x i16> [[ZM:%.*]], i64 0)
+// CPP-CHECK-NEXT: [[TMP5:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv32i16(<vscale x 32 x i16> [[ZM]], i64 8)
+// CPP-CHECK-NEXT: [[TMP6:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv32i16(<vscale x 32 x i16> [[ZM]], i64 16)
+// CPP-CHECK-NEXT: [[TMP7:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv32i16(<vscale x 32 x i16> [[ZM]], i64 24)
+// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.umls.za64.vg4x4.nxv8i16(i32 [[ADD]], <vscale x 8 x i16> [[TMP0]], <vscale x 8 x i16> [[TMP1]], <vscale x 8 x i16> [[TMP2]], <vscale x 8 x i16> [[TMP3]], <vscale x 8 x i16> [[TMP4]], <vscale x 8 x i16> [[TMP5]], <vscale x 8 x i16> [[TMP6]], <vscale x 8 x i16> [[TMP7]])
+// CPP-CHECK-NEXT: ret void
+//
+void test_mlsl_multi_x4_u16(uint32_t slice_base, svuint16x4_t zn, svuint16x4_t zm) __arm_streaming __arm_shared_za
+{
+ SVE_ACLE_FUNC(svmls_za64,_u16,_vg4x4,,)(slice_base + 4, zn, zm);
+}
+
+// SUMLALL
+
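+// Note: the multi-vector svsumla builtin is emitted via the usmla intrinsic
+// with the operand groups swapped, so the unsigned zm tuple (TMP4-TMP7) is
+// passed before the signed zn tuple (TMP0-TMP3) in the call checked below.
+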
+// CHECK-LABEL: @test_sumlal_multi_x4_s8(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[ADD:%.*]] = add i32 [[SLICE_BASE:%.*]], 4
+// CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv64i8(<vscale x 64 x i8> [[ZN:%.*]], i64 0)
+// CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv64i8(<vscale x 64 x i8> [[ZN]], i64 16)
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv64i8(<vscale x 64 x i8> [[ZN]], i64 32)
+// CHECK-NEXT: [[TMP3:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv64i8(<vscale x 64 x i8> [[ZN]], i64 48)
+// CHECK-NEXT: [[TMP4:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv64i8(<vscale x 64 x i8> [[ZM:%.*]], i64 0)
+// CHECK-NEXT: [[TMP5:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv64i8(<vscale x 64 x i8> [[ZM]], i64 16)
+// CHECK-NEXT: [[TMP6:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv64i8(<vscale x 64 x i8> [[ZM]], i64 32)
+// CHECK-NEXT: [[TMP7:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv64i8(<vscale x 64 x i8> [[ZM]], i64 48)
+// CHECK-NEXT: tail call void @llvm.aarch64.sme.usmla.za32.vg4x4.nxv16i8(i32 [[ADD]], <vscale x 16 x i8> [[TMP4]], <vscale x 16 x i8> [[TMP5]], <vscale x 16 x i8> [[TMP6]], <vscale x 16 x i8> [[TMP7]], <vscale x 16 x i8> [[TMP0]], <vscale x 16 x i8> [[TMP1]], <vscale x 16 x i8> [[TMP2]], <vscale x 16 x i8> [[TMP3]])
+// CHECK-NEXT: ret void
+//
+// CPP-CHECK-LABEL: @_Z23test_sumlal_multi_x4_s8j10svint8x4_t11svuint8x4_t(
+// CPP-CHECK-NEXT: entry:
+// CPP-CHECK-NEXT: [[ADD:%.*]] = add i32 [[SLICE_BASE:%.*]], 4
+// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv64i8(<vscale x 64 x i8> [[ZN:%.*]], i64 0)
+// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv64i8(<vscale x 64 x i8> [[ZN]], i64 16)
+// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv64i8(<vscale x 64 x i8> [[ZN]], i64 32)
+// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv64i8(<vscale x 64 x i8> [[ZN]], i64 48)
+// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv64i8(<vscale x 64 x i8> [[ZM:%.*]], i64 0)
+// CPP-CHECK-NEXT: [[TMP5:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv64i8(<vscale x 64 x i8> [[ZM]], i64 16)
+// CPP-CHECK-NEXT: [[TMP6:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv64i8(<vscale x 64 x i8> [[ZM]], i64 32)
+// CPP-CHECK-NEXT: [[TMP7:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv64i8(<vscale x 64 x i8> [[ZM]], i64 48)
+// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.usmla.za32.vg4x4.nxv16i8(i32 [[ADD]], <vscale x 16 x i8> [[TMP4]], <vscale x 16 x i8> [[TMP5]], <vscale x 16 x i8> [[TMP6]], <vscale x 16 x i8> [[TMP7]], <vscale x 16 x i8> [[TMP0]], <vscale x 16 x i8> [[TMP1]], <vscale x 16 x i8> [[TMP2]], <vscale x 16 x i8> [[TMP3]])
+// CPP-CHECK-NEXT: ret void
+//
+void test_sumlal_multi_x4_s8(uint32_t slice_base, svint8x4_t zn, svuint8x4_t zm) __arm_streaming __arm_shared_za
+{
+ SVE_ACLE_FUNC(svsumla_za32,_s8,_vg4x4,,)(slice_base + 4, zn, zm);
+}
+
+// USMLALL
+
+// CHECK-LABEL: @test_usmlal_multi_x4_u8(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[ADD:%.*]] = add i32 [[SLICE_BASE:%.*]], 4
+// CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv64i8(<vscale x 64 x i8> [[ZN:%.*]], i64 0)
+// CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv64i8(<vscale x 64 x i8> [[ZN]], i64 16)
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv64i8(<vscale x 64 x i8> [[ZN]], i64 32)
+// CHECK-NEXT: [[TMP3:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv64i8(<vscale x 64 x i8> [[ZN]], i64 48)
+// CHECK-NEXT: [[TMP4:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv64i8(<vscale x 64 x i8> [[ZM:%.*]], i64 0)
+// CHECK-NEXT: [[TMP5:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv64i8(<vscale x 64 x i8> [[ZM]], i64 16)
+// CHECK-NEXT: [[TMP6:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv64i8(<vscale x 64 x i8> [[ZM]], i64 32)
+// CHECK-NEXT: [[TMP7:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv64i8(<vscale x 64 x i8> [[ZM]], i64 48)
+// CHECK-NEXT: tail call void @llvm.aarch64.sme.usmla.za32.vg4x4.nxv16i8(i32 [[ADD]], <vscale x 16 x i8> [[TMP0]], <vscale x 16 x i8> [[TMP1]], <vscale x 16 x i8> [[TMP2]], <vscale x 16 x i8> [[TMP3]], <vscale x 16 x i8> [[TMP4]], <vscale x 16 x i8> [[TMP5]], <vscale x 16 x i8> [[TMP6]], <vscale x 16 x i8> [[TMP7]])
+// CHECK-NEXT: ret void
+//
+// CPP-CHECK-LABEL: @_Z23test_usmlal_multi_x4_u8j11svuint8x4_t10svint8x4_t(
+// CPP-CHECK-NEXT: entry:
+// CPP-CHECK-NEXT: [[ADD:%.*]] = add i32 [[SLICE_BASE:%.*]], 4
+// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv64i8(<vscale x 64 x i8> [[ZN:%.*]], i64 0)
+// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv64i8(<vscale x 64 x i8> [[ZN]], i64 16)
+// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv64i8(<vscale x 64 x i8> [[ZN]], i64 32)
+// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv64i8(<vscale x 64 x i8> [[ZN]], i64 48)
+// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv64i8(<vscale x 64 x i8> [[ZM:%.*]], i64 0)
+// CPP-CHECK-NEXT: [[TMP5:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv64i8(<vscale x 64 x i8> [[ZM]], i64 16)
+// CPP-CHECK-NEXT: [[TMP6:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv64i8(<vscale x 64 x i8> [[ZM]], i64 32)
+// CPP-CHECK-NEXT: [[TMP7:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv64i8(<vscale x 64 x i8> [[ZM]], i64 48)
+// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.usmla.za32.vg4x4.nxv16i8(i32 [[ADD]], <vscale x 16 x i8> [[TMP0]], <vscale x 16 x i8> [[TMP1]], <vscale x 16 x i8> [[TMP2]], <vscale x 16 x i8> [[TMP3]], <vscale x 16 x i8> [[TMP4]], <vscale x 16 x i8> [[TMP5]], <vscale x 16 x i8> [[TMP6]], <vscale x 16 x i8> [[TMP7]])
+// CPP-CHECK-NEXT: ret void
+//
+void test_usmlal_multi_x4_u8(uint32_t slice_base, svuint8x4_t zn, svint8x4_t zm) __arm_streaming __arm_shared_za
+{
+ SVE_ACLE_FUNC(svusmla_za32,_u8,_vg4x4,,)(slice_base + 4, zn, zm);
+}
+
+//
+// Indexed x 1
+//
+
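+// The x1 lane forms take plain (non-tuple) vectors. The tests below pass the
+// largest valid lane index for each element width: 15 for 8-bit multiplicands
+// accumulating into ZA32, and 7 for 16-bit multiplicands accumulating into ZA64.
+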
+// SMLAL
+
+// CHECK-LABEL: @test_smlal_lane_x1_s8(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[ADD:%.*]] = add i32 [[SLICE_BASE:%.*]], 12
+// CHECK-NEXT: tail call void @llvm.aarch64.sme.smla.za32.lane.vg4x1.nxv16i8(i32 [[ADD]], <vscale x 16 x i8> [[ZN:%.*]], <vscale x 16 x i8> [[ZM:%.*]], i32 15)
+// CHECK-NEXT: ret void
+//
+// CPP-CHECK-LABEL: @_Z21test_smlal_lane_x1_s8ju10__SVInt8_tS_(
+// CPP-CHECK-NEXT: entry:
+// CPP-CHECK-NEXT: [[ADD:%.*]] = add i32 [[SLICE_BASE:%.*]], 12
+// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.smla.za32.lane.vg4x1.nxv16i8(i32 [[ADD]], <vscale x 16 x i8> [[ZN:%.*]], <vscale x 16 x i8> [[ZM:%.*]], i32 15)
+// CPP-CHECK-NEXT: ret void
+//
+void test_smlal_lane_x1_s8(uint32_t slice_base, svint8_t zn, svint8_t zm) __arm_streaming __arm_shared_za
+{
+ SVE_ACLE_FUNC(svmla_lane_za32,_s8,_vg4x1,,)(slice_base + 12, zn, zm, 15);
+}
+
+// CHECK-LABEL: @test_smlal_lane_x1_s16(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[ADD:%.*]] = add i32 [[SLICE_BASE:%.*]], 12
+// CHECK-NEXT: tail call void @llvm.aarch64.sme.smla.za64.lane.vg4x1.nxv8i16(i32 [[ADD]], <vscale x 8 x i16> [[ZN:%.*]], <vscale x 8 x i16> [[ZM:%.*]], i32 7)
+// CHECK-NEXT: ret void
+//
+// CPP-CHECK-LABEL: @_Z22test_smlal_lane_x1_s16ju11__SVInt16_tS_(
+// CPP-CHECK-NEXT: entry:
+// CPP-CHECK-NEXT: [[ADD:%.*]] = add i32 [[SLICE_BASE:%.*]], 12
+// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.smla.za64.lane.vg4x1.nxv8i16(i32 [[ADD]], <vscale x 8 x i16> [[ZN:%.*]], <vscale x 8 x i16> [[ZM:%.*]], i32 7)
+// CPP-CHECK-NEXT: ret void
+//
+void test_smlal_lane_x1_s16(uint32_t slice_base, svint16_t zn, svint16_t zm) __arm_streaming __arm_shared_za
+{
+ SVE_ACLE_FUNC(svmla_lane_za64,_s16,_vg4x1,,)(slice_base + 12, zn, zm, 7);
+}
+
+// CHECK-LABEL: @test_smlal_lane_x1_u8(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[ADD:%.*]] = add i32 [[SLICE_BASE:%.*]], 12
+// CHECK-NEXT: tail call void @llvm.aarch64.sme.umla.za32.lane.vg4x1.nxv16i8(i32 [[ADD]], <vscale x 16 x i8> [[ZN:%.*]], <vscale x 16 x i8> [[ZM:%.*]], i32 15)
+// CHECK-NEXT: ret void
+//
+// CPP-CHECK-LABEL: @_Z21test_smlal_lane_x1_u8ju11__SVUint8_tS_(
+// CPP-CHECK-NEXT: entry:
+// CPP-CHECK-NEXT: [[ADD:%.*]] = add i32 [[SLICE_BASE:%.*]], 12
+// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.umla.za32.lane.vg4x1.nxv16i8(i32 [[ADD]], <vscale x 16 x i8> [[ZN:%.*]], <vscale x 16 x i8> [[ZM:%.*]], i32 15)
+// CPP-CHECK-NEXT: ret void
+//
+void test_smlal_lane_x1_u8(uint32_t slice_base, svuint8_t zn, svuint8_t zm) __arm_streaming __arm_shared_za
+{
+ SVE_ACLE_FUNC(svmla_lane_za32,_u8,_vg4x1,,)(slice_base + 12, zn, zm, 15);
+}
+
+// CHECK-LABEL: @test_smlal_lane_x1_u16(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[ADD:%.*]] = add i32 [[SLICE_BASE:%.*]], 12
+// CHECK-NEXT: tail call void @llvm.aarch64.sme.umla.za64.lane.vg4x1.nxv8i16(i32 [[ADD]], <vscale x 8 x i16> [[ZN:%.*]], <vscale x 8 x i16> [[ZM:%.*]], i32 7)
+// CHECK-NEXT: ret void
+//
+// CPP-CHECK-LABEL: @_Z22test_smlal_lane_x1_u16ju12__SVUint16_tS_(
+// CPP-CHECK-NEXT: entry:
+// CPP-CHECK-NEXT: [[ADD:%.*]] = add i32 [[SLICE_BASE:%.*]], 12
+// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.umla.za64.lane.vg4x1.nxv8i16(i32 [[ADD]], <vscale x 8 x i16> [[ZN:%.*]], <vscale x 8 x i16> [[ZM:%.*]], i32 7)
+// CPP-CHECK-NEXT: ret void
+//
+void test_smlal_lane_x1_u16(uint32_t slice_base, svuint16_t zn, svuint16_t zm) __arm_streaming __arm_shared_za
+{
+ SVE_ACLE_FUNC(svmla_lane_za64,_u16,_vg4x1,,)(slice_base + 12, zn, zm, 7);
+}
+
+// SMLSL
+
+// CHECK-LABEL: @test_smlsl_lane_x1_s8(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[ADD:%.*]] = add i32 [[SLICE_BASE:%.*]], 12
+// CHECK-NEXT: tail call void @llvm.aarch64.sme.smls.za32.lane.vg4x1.nxv16i8(i32 [[ADD]], <vscale x 16 x i8> [[ZN:%.*]], <vscale x 16 x i8> [[ZM:%.*]], i32 15)
+// CHECK-NEXT: ret void
+//
+// CPP-CHECK-LABEL: @_Z21test_smlsl_lane_x1_s8ju10__SVInt8_tS_(
+// CPP-CHECK-NEXT: entry:
+// CPP-CHECK-NEXT: [[ADD:%.*]] = add i32 [[SLICE_BASE:%.*]], 12
+// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.smls.za32.lane.vg4x1.nxv16i8(i32 [[ADD]], <vscale x 16 x i8> [[ZN:%.*]], <vscale x 16 x i8> [[ZM:%.*]], i32 15)
+// CPP-CHECK-NEXT: ret void
+//
+void test_smlsl_lane_x1_s8(uint32_t slice_base, svint8_t zn, svint8_t zm) __arm_streaming __arm_shared_za
+{
+ SVE_ACLE_FUNC(svmls_lane_za32,_s8,_vg4x1,,)(slice_base + 12, zn, zm, 15);
+}
+
+// CHECK-LABEL: @test_smlsl_lane_x1_s16(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[ADD:%.*]] = add i32 [[SLICE_BASE:%.*]], 12
+// CHECK-NEXT: tail call void @llvm.aarch64.sme.smls.za64.lane.vg4x1.nxv8i16(i32 [[ADD]], <vscale x 8 x i16> [[ZN:%.*]], <vscale x 8 x i16> [[ZM:%.*]], i32 7)
+// CHECK-NEXT: ret void
+//
+// CPP-CHECK-LABEL: @_Z22test_smlsl_lane_x1_s16ju11__SVInt16_tS_(
+// CPP-CHECK-NEXT: entry:
+// CPP-CHECK-NEXT: [[ADD:%.*]] = add i32 [[SLICE_BASE:%.*]], 12
+// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.smls.za64.lane.vg4x1.nxv8i16(i32 [[ADD]], <vscale x 8 x i16> [[ZN:%.*]], <vscale x 8 x i16> [[ZM:%.*]], i32 7)
+// CPP-CHECK-NEXT: ret void
+//
+void test_smlsl_lane_x1_s16(uint32_t slice_base, svint16_t zn, svint16_t zm) __arm_streaming __arm_shared_za
+{
+ SVE_ACLE_FUNC(svmls_lane_za64,_s16,_vg4x1,,)(slice_base + 12, zn, zm, 7);
+}
+
+// CHECK-LABEL: @test_smlsl_lane_x1_u8(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[ADD:%.*]] = add i32 [[SLICE_BASE:%.*]], 12
+// CHECK-NEXT: tail call void @llvm.aarch64.sme.umls.za32.lane.vg4x1.nxv16i8(i32 [[ADD]], <vscale x 16 x i8> [[ZN:%.*]], <vscale x 16 x i8> [[ZM:%.*]], i32 15)
+// CHECK-NEXT: ret void
+//
+// CPP-CHECK-LABEL: @_Z21test_smlsl_lane_x1_u8ju11__SVUint8_tS_(
+// CPP-CHECK-NEXT: entry:
+// CPP-CHECK-NEXT: [[ADD:%.*]] = add i32 [[SLICE_BASE:%.*]], 12
+// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.umls.za32.lane.vg4x1.nxv16i8(i32 [[ADD]], <vscale x 16 x i8> [[ZN:%.*]], <vscale x 16 x i8> [[ZM:%.*]], i32 15)
+// CPP-CHECK-NEXT: ret void
+//
+void test_smlsl_lane_x1_u8(uint32_t slice_base, svuint8_t zn, svuint8_t zm) __arm_streaming __arm_shared_za
+{
+ SVE_ACLE_FUNC(svmls_lane_za32,_u8,_vg4x1,,)(slice_base + 12, zn, zm, 15);
+}
+
+// CHECK-LABEL: @test_smlsl_lane_x1_u16(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[ADD:%.*]] = add i32 [[SLICE_BASE:%.*]], 12
+// CHECK-NEXT: tail call void @llvm.aarch64.sme.umls.za64.lane.vg4x1.nxv8i16(i32 [[ADD]], <vscale x 8 x i16> [[ZN:%.*]], <vscale x 8 x i16> [[ZM:%.*]], i32 7)
+// CHECK-NEXT: ret void
+//
+// CPP-CHECK-LABEL: @_Z22test_smlsl_lane_x1_u16ju12__SVUint16_tS_(
+// CPP-CHECK-NEXT: entry:
+// CPP-CHECK-NEXT: [[ADD:%.*]] = add i32 [[SLICE_BASE:%.*]], 12
+// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.umls.za64.lane.vg4x1.nxv8i16(i32 [[ADD]], <vscale x 8 x i16> [[ZN:%.*]], <vscale x 8 x i16> [[ZM:%.*]], i32 7)
+// CPP-CHECK-NEXT: ret void
+//
+void test_smlsl_lane_x1_u16(uint32_t slice_base, svuint16_t zn, svuint16_t zm) __arm_streaming __arm_shared_za
+{
+ SVE_ACLE_FUNC(svmls_lane_za64,_u16,_vg4x1,,)(slice_base + 12, zn, zm, 7);
+}
+
+// SUMLALL
+
+// CHECK-LABEL: @test_sumlall_lane_x1_s8(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[ADD:%.*]] = add i32 [[SLICE_BASE:%.*]], 12
+// CHECK-NEXT: tail call void @llvm.aarch64.sme.sumla.za32.lane.vg4x1.nxv16i8(i32 [[ADD]], <vscale x 16 x i8> [[ZN:%.*]], <vscale x 16 x i8> [[ZM:%.*]], i32 15)
+// CHECK-NEXT: ret void
+//
+// CPP-CHECK-LABEL: @_Z23test_sumlall_lane_x1_s8ju10__SVInt8_tu11__SVUint8_t(
+// CPP-CHECK-NEXT: entry:
+// CPP-CHECK-NEXT: [[ADD:%.*]] = add i32 [[SLICE_BASE:%.*]], 12
+// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.sumla.za32.lane.vg4x1.nxv16i8(i32 [[ADD]], <vscale x 16 x i8> [[ZN:%.*]], <vscale x 16 x i8> [[ZM:%.*]], i32 15)
+// CPP-CHECK-NEXT: ret void
+//
+void test_sumlall_lane_x1_s8(uint32_t slice_base, svint8_t zn, svuint8_t zm) __arm_streaming __arm_shared_za
+{
+ SVE_ACLE_FUNC(svsumla_lane_za32,_s8,_vg4x1,,)(slice_base + 12, zn, zm, 15);
+}
+
+// USMLALL
+
+// CHECK-LABEL: @test_usmlall_lane_x1_u8(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[ADD:%.*]] = add i32 [[SLICE_BASE:%.*]], 12
+// CHECK-NEXT: tail call void @llvm.aarch64.sme.usmla.za32.lane.vg4x1.nxv16i8(i32 [[ADD]], <vscale x 16 x i8> [[ZN:%.*]], <vscale x 16 x i8> [[ZM:%.*]], i32 15)
+// CHECK-NEXT: ret void
+//
+// CPP-CHECK-LABEL: @_Z23test_usmlall_lane_x1_u8ju11__SVUint8_tu10__SVInt8_t(
+// CPP-CHECK-NEXT: entry:
+// CPP-CHECK-NEXT: [[ADD:%.*]] = add i32 [[SLICE_BASE:%.*]], 12
+// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.usmla.za32.lane.vg4x1.nxv16i8(i32 [[ADD]], <vscale x 16 x i8> [[ZN:%.*]], <vscale x 16 x i8> [[ZM:%.*]], i32 15)
+// CPP-CHECK-NEXT: ret void
+//
+void test_usmlall_lane_x1_u8(uint32_t slice_base, svuint8_t zn, svint8_t zm) __arm_streaming __arm_shared_za
+{
+ SVE_ACLE_FUNC(svusmla_lane_za32,_u8,_vg4x1,,)(slice_base + 12, zn, zm, 15);
+}
+
+//
+// Indexed x 2
+//
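+// For the x2 lane forms, the tuple operand zn is lowered by first splitting it
+// into its two subvectors with llvm.vector.extract; these are then passed
+// individually to the .vg4x2 lane intrinsic together with the single zm vector
+// and the lane immediate.
+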
+
+// SMLAL
+
+// CHECK-LABEL: @test_smlal_lane_x2_s8(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[ADD:%.*]] = add i32 [[SLICE_BASE:%.*]], 4
+// CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv32i8(<vscale x 32 x i8> [[ZN:%.*]], i64 0)
+// CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv32i8(<vscale x 32 x i8> [[ZN]], i64 16)
+// CHECK-NEXT: tail call void @llvm.aarch64.sme.smla.za32.lane.vg4x2.nxv16i8(i32 [[ADD]], <vscale x 16 x i8> [[TMP0]], <vscale x 16 x i8> [[TMP1]], <vscale x 16 x i8> [[ZM:%.*]], i32 15)
+// CHECK-NEXT: ret void
+//
+// CPP-CHECK-LABEL: @_Z21test_smlal_lane_x2_s8j10svint8x2_tu10__SVInt8_t(
+// CPP-CHECK-NEXT: entry:
+// CPP-CHECK-NEXT: [[ADD:%.*]] = add i32 [[SLICE_BASE:%.*]], 4
+// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv32i8(<vscale x 32 x i8> [[ZN:%.*]], i64 0)
+// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv32i8(<vscale x 32 x i8> [[ZN]], i64 16)
+// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.smla.za32.lane.vg4x2.nxv16i8(i32 [[ADD]], <vscale x 16 x i8> [[TMP0]], <vscale x 16 x i8> [[TMP1]], <vscale x 16 x i8> [[ZM:%.*]], i32 15)
+// CPP-CHECK-NEXT: ret void
+//
+void test_smlal_lane_x2_s8(uint32_t slice_base, svint8x2_t zn, svint8_t zm) __arm_streaming __arm_shared_za
+{
+ SVE_ACLE_FUNC(svmla_lane_za32,_s8,_vg4x2,,)(slice_base + 4, zn, zm, 15);
+}
+
+// CHECK-LABEL: @test_smlal_lane_x2_s16(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[ADD:%.*]] = add i32 [[SLICE_BASE:%.*]], 4
+// CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv16i16(<vscale x 16 x i16> [[ZN:%.*]], i64 0)
+// CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv16i16(<vscale x 16 x i16> [[ZN]], i64 8)
+// CHECK-NEXT: tail call void @llvm.aarch64.sme.smla.za64.lane.vg4x2.nxv8i16(i32 [[ADD]], <vscale x 8 x i16> [[TMP0]], <vscale x 8 x i16> [[TMP1]], <vscale x 8 x i16> [[ZM:%.*]], i32 7)
+// CHECK-NEXT: ret void
+//
+// CPP-CHECK-LABEL: @_Z22test_smlal_lane_x2_s16j11svint16x2_tu11__SVInt16_t(
+// CPP-CHECK-NEXT: entry:
+// CPP-CHECK-NEXT: [[ADD:%.*]] = add i32 [[SLICE_BASE:%.*]], 4
+// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv16i16(<vscale x 16 x i16> [[ZN:%.*]], i64 0)
+// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv16i16(<vscale x 16 x i16> [[ZN]], i64 8)
+// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.smla.za64.lane.vg4x2.nxv8i16(i32 [[ADD]], <vscale x 8 x i16> [[TMP0]], <vscale x 8 x i16> [[TMP1]], <vscale x 8 x i16> [[ZM:%.*]], i32 7)
+// CPP-CHECK-NEXT: ret void
+//
+void test_smlal_lane_x2_s16(uint32_t slice_base, svint16x2_t zn, svint16_t zm) __arm_streaming __arm_shared_za
+{
+ SVE_ACLE_FUNC(svmla_lane_za64,_s16,_vg4x2,,)(slice_base + 4, zn, zm, 7);
+}
+
+// CHECK-LABEL: @test_smlal_lane_x2_u8(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[ADD:%.*]] = add i32 [[SLICE_BASE:%.*]], 4
+// CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv32i8(<vscale x 32 x i8> [[ZN:%.*]], i64 0)
+// CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv32i8(<vscale x 32 x i8> [[ZN]], i64 16)
+// CHECK-NEXT: tail call void @llvm.aarch64.sme.umla.za32.lane.vg4x2.nxv16i8(i32 [[ADD]], <vscale x 16 x i8> [[TMP0]], <vscale x 16 x i8> [[TMP1]], <vscale x 16 x i8> [[ZM:%.*]], i32 15)
+// CHECK-NEXT: ret void
+//
+// CPP-CHECK-LABEL: @_Z21test_smlal_lane_x2_u8j11svuint8x2_tu11__SVUint8_t(
+// CPP-CHECK-NEXT: entry:
+// CPP-CHECK-NEXT: [[ADD:%.*]] = add i32 [[SLICE_BASE:%.*]], 4
+// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv32i8(<vscale x 32 x i8> [[ZN:%.*]], i64 0)
+// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv32i8(<vscale x 32 x i8> [[ZN]], i64 16)
+// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.umla.za32.lane.vg4x2.nxv16i8(i32 [[ADD]], <vscale x 16 x i8> [[TMP0]], <vscale x 16 x i8> [[TMP1]], <vscale x 16 x i8> [[ZM:%.*]], i32 15)
+// CPP-CHECK-NEXT: ret void
+//
+void test_smlal_lane_x2_u8(uint32_t slice_base, svuint8x2_t zn, svuint8_t zm) __arm_streaming __arm_shared_za
+{
+ SVE_ACLE_FUNC(svmla_lane_za32,_u8,_vg4x2,,)(slice_base + 4, zn, zm, 15);
+}
+
+// CHECK-LABEL: @test_smlal_lane_x2_u16(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[ADD:%.*]] = add i32 [[SLICE_BASE:%.*]], 4
+// CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv16i16(<vscale x 16 x i16> [[ZN:%.*]], i64 0)
+// CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv16i16(<vscale x 16 x i16> [[ZN]], i64 8)
+// CHECK-NEXT: tail call void @llvm.aarch64.sme.umla.za64.lane.vg4x2.nxv8i16(i32 [[ADD]], <vscale x 8 x i16> [[TMP0]], <vscale x 8 x i16> [[TMP1]], <vscale x 8 x i16> [[ZM:%.*]], i32 7)
+// CHECK-NEXT: ret void
+//
+// CPP-CHECK-LABEL: @_Z22test_smlal_lane_x2_u16j12svuint16x2_tu12__SVUint16_t(
+// CPP-CHECK-NEXT: entry:
+// CPP-CHECK-NEXT: [[ADD:%.*]] = add i32 [[SLICE_BASE:%.*]], 4
+// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv16i16(<vscale x 16 x i16> [[ZN:%.*]], i64 0)
+// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv16i16(<vscale x 16 x i16> [[ZN]], i64 8)
+// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.umla.za64.lane.vg4x2.nxv8i16(i32 [[ADD]], <vscale x 8 x i16> [[TMP0]], <vscale x 8 x i16> [[TMP1]], <vscale x 8 x i16> [[ZM:%.*]], i32 7)
+// CPP-CHECK-NEXT: ret void
+//
+void test_smlal_lane_x2_u16(uint32_t slice_base, svuint16x2_t zn, svuint16_t zm) __arm_streaming __arm_shared_za
+{
+ SVE_ACLE_FUNC(svmla_lane_za64,_u16,_vg4x2,,)(slice_base + 4, zn, zm, 7);
+}
+
+// SMLSL
+
+// CHECK-LABEL: @test_smlsl_lane_x2_s8(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[ADD:%.*]] = add i32 [[SLICE_BASE:%.*]], 4
+// CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv32i8(<vscale x 32 x i8> [[ZN:%.*]], i64 0)
+// CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv32i8(<vscale x 32 x i8> [[ZN]], i64 16)
+// CHECK-NEXT: tail call void @llvm.aarch64.sme.smls.za32.lane.vg4x2.nxv16i8(i32 [[ADD]], <vscale x 16 x i8> [[TMP0]], <vscale x 16 x i8> [[TMP1]], <vscale x 16 x i8> [[ZM:%.*]], i32 15)
+// CHECK-NEXT: ret void
+//
+// CPP-CHECK-LABEL: @_Z21test_smlsl_lane_x2_s8j10svint8x2_tu10__SVInt8_t(
+// CPP-CHECK-NEXT: entry:
+// CPP-CHECK-NEXT: [[ADD:%.*]] = add i32 [[SLICE_BASE:%.*]], 4
+// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv32i8(<vscale x 32 x i8> [[ZN:%.*]], i64 0)
+// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv32i8(<vscale x 32 x i8> [[ZN]], i64 16)
+// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.smls.za32.lane.vg4x2.nxv16i8(i32 [[ADD]], <vscale x 16 x i8> [[TMP0]], <vscale x 16 x i8> [[TMP1]], <vscale x 16 x i8> [[ZM:%.*]], i32 15)
+// CPP-CHECK-NEXT: ret void
+//
+void test_smlsl_lane_x2_s8(uint32_t slice_base, svint8x2_t zn, svint8_t zm) __arm_streaming __arm_shared_za
+{
+ SVE_ACLE_FUNC(svmls_lane_za32,_s8,_vg4x2,,)(slice_base + 4, zn, zm, 15);
+}
+
+// CHECK-LABEL: @test_smlsl_lane_x2_s16(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[ADD:%.*]] = add i32 [[SLICE_BASE:%.*]], 4
+// CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv16i16(<vscale x 16 x i16> [[ZN:%.*]], i64 0)
+// CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv16i16(<vscale x 16 x i16> [[ZN]], i64 8)
+// CHECK-NEXT: tail call void @llvm.aarch64.sme.smls.za64.lane.vg4x2.nxv8i16(i32 [[ADD]], <vscale x 8 x i16> [[TMP0]], <vscale x 8 x i16> [[TMP1]], <vscale x 8 x i16> [[ZM:%.*]], i32 7)
+// CHECK-NEXT: ret void
+//
+// CPP-CHECK-LABEL: @_Z22test_smlsl_lane_x2_s16j11svint16x2_tu11__SVInt16_t(
+// CPP-CHECK-NEXT: entry:
+// CPP-CHECK-NEXT: [[ADD:%.*]] = add i32 [[SLICE_BASE:%.*]], 4
+// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv16i16(<vscale x 16 x i16> [[ZN:%.*]], i64 0)
+// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv16i16(<vscale x 16 x i16> [[ZN]], i64 8)
+// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.smls.za64.lane.vg4x2.nxv8i16(i32 [[ADD]], <vscale x 8 x i16> [[TMP0]], <vscale x 8 x i16> [[TMP1]], <vscale x 8 x i16> [[ZM:%.*]], i32 7)
+// CPP-CHECK-NEXT: ret void
+//
+void test_smlsl_lane_x2_s16(uint32_t slice_base, svint16x2_t zn, svint16_t zm) __arm_streaming __arm_shared_za
+{
+ SVE_ACLE_FUNC(svmls_lane_za64,_s16,_vg4x2,,)(slice_base + 4, zn, zm, 7);
+}
+
+// CHECK-LABEL: @test_smlsl_lane_x2_u8(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[ADD:%.*]] = add i32 [[SLICE_BASE:%.*]], 4
+// CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv32i8(<vscale x 32 x i8> [[ZN:%.*]], i64 0)
+// CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv32i8(<vscale x 32 x i8> [[ZN]], i64 16)
+// CHECK-NEXT: tail call void @llvm.aarch64.sme.umls.za32.lane.vg4x2.nxv16i8(i32 [[ADD]], <vscale x 16 x i8> [[TMP0]], <vscale x 16 x i8> [[TMP1]], <vscale x 16 x i8> [[ZM:%.*]], i32 15)
+// CHECK-NEXT: ret void
+//
+// CPP-CHECK-LABEL: @_Z21test_smlsl_lane_x2_u8j11svuint8x2_tu11__SVUint8_t(
+// CPP-CHECK-NEXT: entry:
+// CPP-CHECK-NEXT: [[ADD:%.*]] = add i32 [[SLICE_BASE:%.*]], 4
+// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv32i8(<vscale x 32 x i8> [[ZN:%.*]], i64 0)
+// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv32i8(<vscale x 32 x i8> [[ZN]], i64 16)
+// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.umls.za32.lane.vg4x2.nxv16i8(i32 [[ADD]], <vscale x 16 x i8> [[TMP0]], <vscale x 16 x i8> [[TMP1]], <vscale x 16 x i8> [[ZM:%.*]], i32 15)
+// CPP-CHECK-NEXT: ret void
+//
+void test_smlsl_lane_x2_u8(uint32_t slice_base, svuint8x2_t zn, svuint8_t zm) __arm_streaming __arm_shared_za
+{
+ SVE_ACLE_FUNC(svmls_lane_za32,_u8,_vg4x2,,)(slice_base + 4, zn, zm, 15);
+}
+
+// CHECK-LABEL: @test_smlsl_lane_x2_u16(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[ADD:%.*]] = add i32 [[SLICE_BASE:%.*]], 4
+// CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv16i16(<vscale x 16 x i16> [[ZN:%.*]], i64 0)
+// CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv16i16(<vscale x 16 x i16> [[ZN]], i64 8)
+// CHECK-NEXT: tail call void @llvm.aarch64.sme.umls.za64.lane.vg4x2.nxv8i16(i32 [[ADD]], <vscale x 8 x i16> [[TMP0]], <vscale x 8 x i16> [[TMP1]], <vscale x 8 x i16> [[ZM:%.*]], i32 7)
+// CHECK-NEXT: ret void
+//
+// CPP-CHECK-LABEL: @_Z22test_smlsl_lane_x2_u16j12svuint16x2_tu12__SVUint16_t(
+// CPP-CHECK-NEXT: entry:
+// CPP-CHECK-NEXT: [[ADD:%.*]] = add i32 [[SLICE_BASE:%.*]], 4
+// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv16i16(<vscale x 16 x i16> [[ZN:%.*]], i64 0)
+// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv16i16(<vscale x 16 x i16> [[ZN]], i64 8)
+// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.umls.za64.lane.vg4x2.nxv8i16(i32 [[ADD]], <vscale x 8 x i16> [[TMP0]], <vscale x 8 x i16> [[TMP1]], <vscale x 8 x i16> [[ZM:%.*]], i32 7)
+// CPP-CHECK-NEXT: ret void
+//
+void test_smlsl_lane_x2_u16(uint32_t slice_base, svuint16x2_t zn, svuint16_t zm) __arm_streaming __arm_shared_za
+{
+ SVE_ACLE_FUNC(svmls_lane_za64,_u16,_vg4x2,,)(slice_base + 4, zn, zm, 7);
+}
+
+// SUMLALL
+
+// CHECK-LABEL: @test_sumlall_lane_x2_s8(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[ADD:%.*]] = add i32 [[SLICE_BASE:%.*]], 4
+// CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv32i8(<vscale x 32 x i8> [[ZN:%.*]], i64 0)
+// CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv32i8(<vscale x 32 x i8> [[ZN]], i64 16)
+// CHECK-NEXT: tail call void @llvm.aarch64.sme.sumla.za32.lane.vg4x2.nxv16i8(i32 [[ADD]], <vscale x 16 x i8> [[TMP0]], <vscale x 16 x i8> [[TMP1]], <vscale x 16 x i8> [[ZM:%.*]], i32 15)
+// CHECK-NEXT: ret void
+//
+// CPP-CHECK-LABEL: @_Z23test_sumlall_lane_x2_s8j10svint8x2_tu11__SVUint8_t(
+// CPP-CHECK-NEXT: entry:
+// CPP-CHECK-NEXT: [[ADD:%.*]] = add i32 [[SLICE_BASE:%.*]], 4
+// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv32i8(<vscale x 32 x i8> [[ZN:%.*]], i64 0)
+// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv32i8(<vscale x 32 x i8> [[ZN]], i64 16)
+// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.sumla.za32.lane.vg4x2.nxv16i8(i32 [[ADD]], <vscale x 16 x i8> [[TMP0]], <vscale x 16 x i8> [[TMP1]], <vscale x 16 x i8> [[ZM:%.*]], i32 15)
+// CPP-CHECK-NEXT: ret void
+//
+void test_sumlall_lane_x2_s8(uint32_t slice_base, svint8x2_t zn, svuint8_t zm) __arm_streaming __arm_shared_za
+{
+ SVE_ACLE_FUNC(svsumla_lane_za32,_s8,_vg4x2,,)(slice_base + 4, zn, zm, 15);
+}
+
+// USMLALL
+
+// CHECK-LABEL: @test_usmlall_lane_x2_u8(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[ADD:%.*]] = add i32 [[SLICE_BASE:%.*]], 4
+// CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv32i8(<vscale x 32 x i8> [[ZN:%.*]], i64 0)
+// CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv32i8(<vscale x 32 x i8> [[ZN]], i64 16)
+// CHECK-NEXT: tail call void @llvm.aarch64.sme.usmla.za32.lane.vg4x2.nxv16i8(i32 [[ADD]], <vscale x 16 x i8> [[TMP0]], <vscale x 16 x i8> [[TMP1]], <vscale x 16 x i8> [[ZM:%.*]], i32 15)
+// CHECK-NEXT: ret void
+//
+// CPP-CHECK-LABEL: @_Z23test_usmlall_lane_x2_u8j11svuint8x2_tu10__SVInt8_t(
+// CPP-CHECK-NEXT: entry:
+// CPP-CHECK-NEXT: [[ADD:%.*]] = add i32 [[SLICE_BASE:%.*]], 4
+// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv32i8(<vscale x 32 x i8> [[ZN:%.*]], i64 0)
+// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv32i8(<vscale x 32 x i8> [[ZN]], i64 16)
+// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.usmla.za32.lane.vg4x2.nxv16i8(i32 [[ADD]], <vscale x 16 x i8> [[TMP0]], <vscale x 16 x i8> [[TMP1]], <vscale x 16 x i8> [[ZM:%.*]], i32 15)
+// CPP-CHECK-NEXT: ret void
+//
+void test_usmlall_lane_x2_u8(uint32_t slice_base, svuint8x2_t zn, svint8_t zm) __arm_streaming __arm_shared_za
+{
+ SVE_ACLE_FUNC(svusmla_lane_za32,_u8,_vg4x2,,)(slice_base + 4, zn, zm, 15);
+}
+
+//
+// Indexed x 4
+//
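+// The x4 lane forms follow the same pattern, with four llvm.vector.extract
+// calls at subvector offsets 0/16/32/48 for 8-bit tuples and 0/8/16/24 for
+// 16-bit tuples, matching the checks below.
+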
+
+// SMLAL
+
+// CHECK-LABEL: @test_smlal_lane_x4_s8(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[ADD:%.*]] = add i32 [[SLICE_BASE:%.*]], 4
+// CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv64i8(<vscale x 64 x i8> [[ZN:%.*]], i64 0)
+// CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv64i8(<vscale x 64 x i8> [[ZN]], i64 16)
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv64i8(<vscale x 64 x i8> [[ZN]], i64 32)
+// CHECK-NEXT: [[TMP3:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv64i8(<vscale x 64 x i8> [[ZN]], i64 48)
+// CHECK-NEXT: tail call void @llvm.aarch64.sme.smla.za32.lane.vg4x4.nxv16i8(i32 [[ADD]], <vscale x 16 x i8> [[TMP0]], <vscale x 16 x i8> [[TMP1]], <vscale x 16 x i8> [[TMP2]], <vscale x 16 x i8> [[TMP3]], <vscale x 16 x i8> [[ZM:%.*]], i32 15)
+// CHECK-NEXT: ret void
+//
+// CPP-CHECK-LABEL: @_Z21test_smlal_lane_x4_s8j10svint8x4_tu10__SVInt8_t(
+// CPP-CHECK-NEXT: entry:
+// CPP-CHECK-NEXT: [[ADD:%.*]] = add i32 [[SLICE_BASE:%.*]], 4
+// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv64i8(<vscale x 64 x i8> [[ZN:%.*]], i64 0)
+// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv64i8(<vscale x 64 x i8> [[ZN]], i64 16)
+// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv64i8(<vscale x 64 x i8> [[ZN]], i64 32)
+// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv64i8(<vscale x 64 x i8> [[ZN]], i64 48)
+// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.smla.za32.lane.vg4x4.nxv16i8(i32 [[ADD]], <vscale x 16 x i8> [[TMP0]], <vscale x 16 x i8> [[TMP1]], <vscale x 16 x i8> [[TMP2]], <vscale x 16 x i8> [[TMP3]], <vscale x 16 x i8> [[ZM:%.*]], i32 15)
+// CPP-CHECK-NEXT: ret void
+//
+void test_smlal_lane_x4_s8(uint32_t slice_base, svint8x4_t zn, svint8_t zm) __arm_streaming __arm_shared_za
+{
+ SVE_ACLE_FUNC(svmla_lane_za32,_s8,_vg4x4,,)(slice_base + 4, zn, zm, 15);
+}
+
+// CHECK-LABEL: @test_smlal_lane_x4_s16(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[ADD:%.*]] = add i32 [[SLICE_BASE:%.*]], 4
+// CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv32i16(<vscale x 32 x i16> [[ZN:%.*]], i64 0)
+// CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv32i16(<vscale x 32 x i16> [[ZN]], i64 8)
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv32i16(<vscale x 32 x i16> [[ZN]], i64 16)
+// CHECK-NEXT: [[TMP3:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv32i16(<vscale x 32 x i16> [[ZN]], i64 24)
+// CHECK-NEXT: tail call void @llvm.aarch64.sme.smla.za64.lane.vg4x4.nxv8i16(i32 [[ADD]], <vscale x 8 x i16> [[TMP0]], <vscale x 8 x i16> [[TMP1]], <vscale x 8 x i16> [[TMP2]], <vscale x 8 x i16> [[TMP3]], <vscale x 8 x i16> [[ZM:%.*]], i32 7)
+// CHECK-NEXT: ret void
+//
+// CPP-CHECK-LABEL: @_Z22test_smlal_lane_x4_s16j11svint16x4_tu11__SVInt16_t(
+// CPP-CHECK-NEXT: entry:
+// CPP-CHECK-NEXT: [[ADD:%.*]] = add i32 [[SLICE_BASE:%.*]], 4
+// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv32i16(<vscale x 32 x i16> [[ZN:%.*]], i64 0)
+// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv32i16(<vscale x 32 x i16> [[ZN]], i64 8)
+// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv32i16(<vscale x 32 x i16> [[ZN]], i64 16)
+// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv32i16(<vscale x 32 x i16> [[ZN]], i64 24)
+// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.smla.za64.lane.vg4x4.nxv8i16(i32 [[ADD]], <vscale x 8 x i16> [[TMP0]], <vscale x 8 x i16> [[TMP1]], <vscale x 8 x i16> [[TMP2]], <vscale x 8 x i16> [[TMP3]], <vscale x 8 x i16> [[ZM:%.*]], i32 7)
+// CPP-CHECK-NEXT: ret void
+//
+void test_smlal_lane_x4_s16(uint32_t slice_base, svint16x4_t zn, svint16_t zm) __arm_streaming __arm_shared_za
+{
+ SVE_ACLE_FUNC(svmla_lane_za64,_s16,_vg4x4,,)(slice_base + 4, zn, zm, 7);
+}
+
+// CHECK-LABEL: @test_smlal_lane_x4_u8(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[ADD:%.*]] = add i32 [[SLICE_BASE:%.*]], 4
+// CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv64i8(<vscale x 64 x i8> [[ZN:%.*]], i64 0)
+// CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv64i8(<vscale x 64 x i8> [[ZN]], i64 16)
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv64i8(<vscale x 64 x i8> [[ZN]], i64 32)
+// CHECK-NEXT: [[TMP3:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv64i8(<vscale x 64 x i8> [[ZN]], i64 48)
+// CHECK-NEXT: tail call void @llvm.aarch64.sme.umla.za32.lane.vg4x4.nxv16i8(i32 [[ADD]], <vscale x 16 x i8> [[TMP0]], <vscale x 16 x i8> [[TMP1]], <vscale x 16 x i8> [[TMP2]], <vscale x 16 x i8> [[TMP3]], <vscale x 16 x i8> [[ZM:%.*]], i32 15)
+// CHECK-NEXT: ret void
+//
+// CPP-CHECK-LABEL: @_Z21test_smlal_lane_x4_u8j11svuint8x4_tu11__SVUint8_t(
+// CPP-CHECK-NEXT: entry:
+// CPP-CHECK-NEXT: [[ADD:%.*]] = add i32 [[SLICE_BASE:%.*]], 4
+// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv64i8(<vscale x 64 x i8> [[ZN:%.*]], i64 0)
+// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv64i8(<vscale x 64 x i8> [[ZN]], i64 16)
+// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv64i8(<vscale x 64 x i8> [[ZN]], i64 32)
+// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv64i8(<vscale x 64 x i8> [[ZN]], i64 48)
+// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.umla.za32.lane.vg4x4.nxv16i8(i32 [[ADD]], <vscale x 16 x i8> [[TMP0]], <vscale x 16 x i8> [[TMP1]], <vscale x 16 x i8> [[TMP2]], <vscale x 16 x i8> [[TMP3]], <vscale x 16 x i8> [[ZM:%.*]], i32 15)
+// CPP-CHECK-NEXT: ret void
+//
+void test_smlal_lane_x4_u8(uint32_t slice_base, svuint8x4_t zn, svuint8_t zm) __arm_streaming __arm_shared_za
+{
+ SVE_ACLE_FUNC(svmla_lane_za32,_u8,_vg4x4,,)(slice_base + 4, zn, zm, 15);
+}
+
+// CHECK-LABEL: @test_smlal_lane_x4_u16(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[ADD:%.*]] = add i32 [[SLICE_BASE:%.*]], 4
+// CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv32i16(<vscale x 32 x i16> [[ZN:%.*]], i64 0)
+// CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv32i16(<vscale x 32 x i16> [[ZN]], i64 8)
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv32i16(<vscale x 32 x i16> [[ZN]], i64 16)
+// CHECK-NEXT: [[TMP3:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv32i16(<vscale x 32 x i16> [[ZN]], i64 24)
+// CHECK-NEXT: tail call void @llvm.aarch64.sme.umla.za64.lane.vg4x4.nxv8i16(i32 [[ADD]], <vscale x 8 x i16> [[TMP0]], <vscale x 8 x i16> [[TMP1]], <vscale x 8 x i16> [[TMP2]], <vscale x 8 x i16> [[TMP3]], <vscale x 8 x i16> [[ZM:%.*]], i32 7)
+// CHECK-NEXT: ret void
+//
+// CPP-CHECK-LABEL: @_Z22test_smlal_lane_x4_u16j12svuint16x4_tu12__SVUint16_t(
+// CPP-CHECK-NEXT: entry:
+// CPP-CHECK-NEXT: [[ADD:%.*]] = add i32 [[SLICE_BASE:%.*]], 4
+// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv32i16(<vscale x 32 x i16> [[ZN:%.*]], i64 0)
+// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv32i16(<vscale x 32 x i16> [[ZN]], i64 8)
+// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv32i16(<vscale x 32 x i16> [[ZN]], i64 16)
+// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv32i16(<vscale x 32 x i16> [[ZN]], i64 24)
+// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.umla.za64.lane.vg4x4.nxv8i16(i32 [[ADD]], <vscale x 8 x i16> [[TMP0]], <vscale x 8 x i16> [[TMP1]], <vscale x 8 x i16> [[TMP2]], <vscale x 8 x i16> [[TMP3]], <vscale x 8 x i16> [[ZM:%.*]], i32 7)
+// CPP-CHECK-NEXT: ret void
+//
+void test_smlal_lane_x4_u16(uint32_t slice_base, svuint16x4_t zn, svuint16_t zm) __arm_streaming __arm_shared_za
+{
+ SVE_ACLE_FUNC(svmla_lane_za64,_u16,_vg4x4,,)(slice_base + 4, zn, zm, 7);
+}
+
+// SMLSL
+
+// CHECK-LABEL: @test_smlsl_lane_x4_s8(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[ADD:%.*]] = add i32 [[SLICE_BASE:%.*]], 4
+// CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv64i8(<vscale x 64 x i8> [[ZN:%.*]], i64 0)
+// CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv64i8(<vscale x 64 x i8> [[ZN]], i64 16)
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv64i8(<vscale x 64 x i8> [[ZN]], i64 32)
+// CHECK-NEXT: [[TMP3:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv64i8(<vscale x 64 x i8> [[ZN]], i64 48)
+// CHECK-NEXT: tail call void @llvm.aarch64.sme.smls.za32.lane.vg4x4.nxv16i8(i32 [[ADD]], <vscale x 16 x i8> [[TMP0]], <vscale x 16 x i8> [[TMP1]], <vscale x 16 x i8> [[TMP2]], <vscale x 16 x i8> [[TMP3]], <vscale x 16 x i8> [[ZM:%.*]], i32 15)
+// CHECK-NEXT: ret void
+//
+// CPP-CHECK-LABEL: @_Z21test_smlsl_lane_x4_s8j10svint8x4_tu10__SVInt8_t(
+// CPP-CHECK-NEXT: entry:
+// CPP-CHECK-NEXT: [[ADD:%.*]] = add i32 [[SLICE_BASE:%.*]], 4
+// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv64i8(<vscale x 64 x i8> [[ZN:%.*]], i64 0)
+// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv64i8(<vscale x 64 x i8> [[ZN]], i64 16)
+// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv64i8(<vscale x 64 x i8> [[ZN]], i64 32)
+// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv64i8(<vscale x 64 x i8> [[ZN]], i64 48)
+// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.smls.za32.lane.vg4x4.nxv16i8(i32 [[ADD]], <vscale x 16 x i8> [[TMP0]], <vscale x 16 x i8> [[TMP1]], <vscale x 16 x i8> [[TMP2]], <vscale x 16 x i8> [[TMP3]], <vscale x 16 x i8> [[ZM:%.*]], i32 15)
+// CPP-CHECK-NEXT: ret void
+//
+void test_smlsl_lane_x4_s8(uint32_t slice_base, svint8x4_t zn, svint8_t zm) __arm_streaming __arm_shared_za
+{
+ SVE_ACLE_FUNC(svmls_lane_za32,_s8,_vg4x4,,)(slice_base + 4, zn, zm, 15);
+}
+
+// CHECK-LABEL: @test_smlsl_lane_x4_s16(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[ADD:%.*]] = add i32 [[SLICE_BASE:%.*]], 4
+// CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv32i16(<vscale x 32 x i16> [[ZN:%.*]], i64 0)
+// CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv32i16(<vscale x 32 x i16> [[ZN]], i64 8)
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv32i16(<vscale x 32 x i16> [[ZN]], i64 16)
+// CHECK-NEXT: [[TMP3:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv32i16(<vscale x 32 x i16> [[ZN]], i64 24)
+// CHECK-NEXT: tail call void @llvm.aarch64.sme.smls.za64.lane.vg4x4.nxv8i16(i32 [[ADD]], <vscale x 8 x i16> [[TMP0]], <vscale x 8 x i16> [[TMP1]], <vscale x 8 x i16> [[TMP2]], <vscale x 8 x i16> [[TMP3]], <vscale x 8 x i16> [[ZM:%.*]], i32 7)
+// CHECK-NEXT: ret void
+//
+// CPP-CHECK-LABEL: @_Z22test_smlsl_lane_x4_s16j11svint16x4_tu11__SVInt16_t(
+// CPP-CHECK-NEXT: entry:
+// CPP-CHECK-NEXT: [[ADD:%.*]] = add i32 [[SLICE_BASE:%.*]], 4
+// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv32i16(<vscale x 32 x i16> [[ZN:%.*]], i64 0)
+// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv32i16(<vscale x 32 x i16> [[ZN]], i64 8)
+// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv32i16(<vscale x 32 x i16> [[ZN]], i64 16)
+// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv32i16(<vscale x 32 x i16> [[ZN]], i64 24)
+// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.smls.za64.lane.vg4x4.nxv8i16(i32 [[ADD]], <vscale x 8 x i16> [[TMP0]], <vscale x 8 x i16> [[TMP1]], <vscale x 8 x i16> [[TMP2]], <vscale x 8 x i16> [[TMP3]], <vscale x 8 x i16> [[ZM:%.*]], i32 7)
+// CPP-CHECK-NEXT: ret void
+//
+void test_smlsl_lane_x4_s16(uint32_t slice_base, svint16x4_t zn, svint16_t zm) __arm_streaming __arm_shared_za
+{
+ SVE_ACLE_FUNC(svmls_lane_za64,_s16,_vg4x4,,)(slice_base + 4, zn, zm, 7);
+}
+
+// CHECK-LABEL: @test_smlsl_lane_x4_u8(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[ADD:%.*]] = add i32 [[SLICE_BASE:%.*]], 4
+// CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv64i8(<vscale x 64 x i8> [[ZN:%.*]], i64 0)
+// CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv64i8(<vscale x 64 x i8> [[ZN]], i64 16)
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv64i8(<vscale x 64 x i8> [[ZN]], i64 32)
+// CHECK-NEXT: [[TMP3:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv64i8(<vscale x 64 x i8> [[ZN]], i64 48)
+// CHECK-NEXT: tail call void @llvm.aarch64.sme.umls.za32.lane.vg4x4.nxv16i8(i32 [[ADD]], <vscale x 16 x i8> [[TMP0]], <vscale x 16 x i8> [[TMP1]], <vscale x 16 x i8> [[TMP2]], <vscale x 16 x i8> [[TMP3]], <vscale x 16 x i8> [[ZM:%.*]], i32 15)
+// CHECK-NEXT: ret void
+//
+// CPP-CHECK-LABEL: @_Z21test_smlsl_lane_x4_u8j11svuint8x4_tu11__SVUint8_t(
+// CPP-CHECK-NEXT: entry:
+// CPP-CHECK-NEXT: [[ADD:%.*]] = add i32 [[SLICE_BASE:%.*]], 4
+// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv64i8(<vscale x 64 x i8> [[ZN:%.*]], i64 0)
+// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv64i8(<vscale x 64 x i8> [[ZN]], i64 16)
+// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv64i8(<vscale x 64 x i8> [[ZN]], i64 32)
+// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv64i8(<vscale x 64 x i8> [[ZN]], i64 48)
+// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.umls.za32.lane.vg4x4.nxv16i8(i32 [[ADD]], <vscale x 16 x i8> [[TMP0]], <vscale x 16 x i8> [[TMP1]], <vscale x 16 x i8> [[TMP2]], <vscale x 16 x i8> [[TMP3]], <vscale x 16 x i8> [[ZM:%.*]], i32 15)
+// CPP-CHECK-NEXT: ret void
+//
+void test_smlsl_lane_x4_u8(uint32_t slice_base, svuint8x4_t zn, svuint8_t zm) __arm_streaming __arm_shared_za
+{
+ SVE_ACLE_FUNC(svmls_lane_za32,_u8,_vg4x4,,)(slice_base + 4, zn, zm, 15);
+}
+
+// CHECK-LABEL: @test_smlsl_lane_x4_u16(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[ADD:%.*]] = add i32 [[SLICE_BASE:%.*]], 4
+// CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv32i16(<vscale x 32 x i16> [[ZN:%.*]], i64 0)
+// CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv32i16(<vscale x 32 x i16> [[ZN]], i64 8)
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv32i16(<vscale x 32 x i16> [[ZN]], i64 16)
+// CHECK-NEXT: [[TMP3:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv32i16(<vscale x 32 x i16> [[ZN]], i64 24)
+// CHECK-NEXT: tail call void @llvm.aarch64.sme.umls.za64.lane.vg4x4.nxv8i16(i32 [[ADD]], <vscale x 8 x i16> [[TMP0]], <vscale x 8 x i16> [[TMP1]], <vscale x 8 x i16> [[TMP2]], <vscale x 8 x i16> [[TMP3]], <vscale x 8 x i16> [[ZM:%.*]], i32 7)
+// CHECK-NEXT: ret void
+//
+// CPP-CHECK-LABEL: @_Z22test_umlsl_lane_x4_u16j12svuint16x4_tu12__SVUint16_t(
+// CPP-CHECK-NEXT: entry:
+// CPP-CHECK-NEXT: [[ADD:%.*]] = add i32 [[SLICE_BASE:%.*]], 4
+// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv32i16(<vscale x 32 x i16> [[ZN:%.*]], i64 0)
+// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv32i16(<vscale x 32 x i16> [[ZN]], i64 8)
+// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv32i16(<vscale x 32 x i16> [[ZN]], i64 16)
+// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv32i16(<vscale x 32 x i16> [[ZN]], i64 24)
+// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.umls.za64.lane.vg4x4.nxv8i16(i32 [[ADD]], <vscale x 8 x i16> [[TMP0]], <vscale x 8 x i16> [[TMP1]], <vscale x 8 x i16> [[TMP2]], <vscale x 8 x i16> [[TMP3]], <vscale x 8 x i16> [[ZM:%.*]], i32 7)
+// CPP-CHECK-NEXT: ret void
+//
+void test_umlsl_lane_x4_u16(uint32_t slice_base, svuint16x4_t zn, svuint16_t zm) __arm_streaming __arm_shared_za
+{
+ SVE_ACLE_FUNC(svmls_lane_za64,_u16,_vg4x4,,)(slice_base + 4, zn, zm, 7);
+}
+
+// SUMLALL
+
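+// Note: the svsumla forms multiply signed multi-vector data (zn) by an
+// unsigned indexed operand (zm), hence svint8x4_t zn and svuint8_t zm below.
+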
+// CHECK-LABEL: @test_sumlall_lane_x4_s8(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[ADD:%.*]] = add i32 [[SLICE_BASE:%.*]], 4
+// CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv64i8(<vscale x 64 x i8> [[ZN:%.*]], i64 0)
+// CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv64i8(<vscale x 64 x i8> [[ZN]], i64 16)
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv64i8(<vscale x 64 x i8> [[ZN]], i64 32)
+// CHECK-NEXT: [[TMP3:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv64i8(<vscale x 64 x i8> [[ZN]], i64 48)
+// CHECK-NEXT: tail call void @llvm.aarch64.sme.sumla.za32.lane.vg4x4.nxv16i8(i32 [[ADD]], <vscale x 16 x i8> [[TMP0]], <vscale x 16 x i8> [[TMP1]], <vscale x 16 x i8> [[TMP2]], <vscale x 16 x i8> [[TMP3]], <vscale x 16 x i8> [[ZM:%.*]], i32 15)
+// CHECK-NEXT: ret void
+//
+// CPP-CHECK-LABEL: @_Z23test_sumlall_lane_x4_s8j10svint8x4_tu11__SVUint8_t(
+// CPP-CHECK-NEXT: entry:
+// CPP-CHECK-NEXT: [[ADD:%.*]] = add i32 [[SLICE_BASE:%.*]], 4
+// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv64i8(<vscale x 64 x i8> [[ZN:%.*]], i64 0)
+// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv64i8(<vscale x 64 x i8> [[ZN]], i64 16)
+// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv64i8(<vscale x 64 x i8> [[ZN]], i64 32)
+// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv64i8(<vscale x 64 x i8> [[ZN]], i64 48)
+// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.sumla.za32.lane.vg4x4.nxv16i8(i32 [[ADD]], <vscale x 16 x i8> [[TMP0]], <vscale x 16 x i8> [[TMP1]], <vscale x 16 x i8> [[TMP2]], <vscale x 16 x i8> [[TMP3]], <vscale x 16 x i8> [[ZM:%.*]], i32 15)
+// CPP-CHECK-NEXT: ret void
+//
+void test_sumlall_lane_x4_s8(uint32_t slice_base, svint8x4_t zn, svuint8_t zm) __arm_streaming __arm_shared_za
+{
+ SVE_ACLE_FUNC(svsumla_lane_za32,_s8,_vg4x4,,)(slice_base + 4, zn, zm, 15);
+}
+
+// USMLALL
+
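+// Note: the svusmla forms mirror svsumla, pairing unsigned multi-vector data
+// (zn) with a signed operand (zm), hence svuint8x4_t zn and svint8_t zm below.
+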
+// CHECK-LABEL: @test_usmlall_lane_x4_u8(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[ADD:%.*]] = add i32 [[SLICE_BASE:%.*]], 4
+// CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv64i8(<vscale x 64 x i8> [[ZN:%.*]], i64 0)
+// CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv64i8(<vscale x 64 x i8> [[ZN]], i64 16)
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv64i8(<vscale x 64 x i8> [[ZN]], i64 32)
+// CHECK-NEXT: [[TMP3:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv64i8(<vscale x 64 x i8> [[ZN]], i64 48)
+// CHECK-NEXT: tail call void @llvm.aarch64.sme.usmla.za32.lane.vg4x4.nxv16i8(i32 [[ADD]], <vscale x 16 x i8> [[TMP0]], <vscale x 16 x i8> [[TMP1]], <vscale x 16 x i8> [[TMP2]], <vscale x 16 x i8> [[TMP3]], <vscale x 16 x i8> [[ZM:%.*]], i32 15)
+// CHECK-NEXT: ret void
+//
+// CPP-CHECK-LABEL: @_Z23test_usmlall_lane_x4_u8j11svuint8x4_tu10__SVInt8_t(
+// CPP-CHECK-NEXT: entry:
+// CPP-CHECK-NEXT: [[ADD:%.*]] = add i32 [[SLICE_BASE:%.*]], 4
+// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv64i8(<vscale x 64 x i8> [[ZN:%.*]], i64 0)
+// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv64i8(<vscale x 64 x i8> [[ZN]], i64 16)
+// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv64i8(<vscale x 64 x i8> [[ZN]], i64 32)
+// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv64i8(<vscale x 64 x i8> [[ZN]], i64 48)
+// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.usmla.za32.lane.vg4x4.nxv16i8(i32 [[ADD]], <vscale x 16 x i8> [[TMP0]], <vscale x 16 x i8> [[TMP1]], <vscale x 16 x i8> [[TMP2]], <vscale x 16 x i8> [[TMP3]], <vscale x 16 x i8> [[ZM:%.*]], i32 15)
+// CPP-CHECK-NEXT: ret void
+//
+void test_usmlall_lane_x4_u8(uint32_t slice_base, svuint8x4_t zn, svint8_t zm) __arm_streaming __arm_shared_za
+{
+ SVE_ACLE_FUNC(svusmla_lane_za32,_u8,_vg4x4,,)(slice_base + 4, zn, zm, 15);
+}
diff --git a/clang/test/CodeGen/aarch64-sme2-intrinsics/acle_sme2_mlsl.c b/clang/test/CodeGen/aarch64-sme2-intrinsics/acle_sme2_mlsl.c
new file mode 100644
index 00000000000000..e8528271c81d37
--- /dev/null
+++ b/clang/test/CodeGen/aarch64-sme2-intrinsics/acle_sme2_mlsl.c
@@ -0,0 +1,760 @@
+// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py
+// REQUIRES: aarch64-registered-target
+
+// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme2 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s
+// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme2 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK
+// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sme2 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s
+// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sme2 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK
+// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme2 -S -disable-O0-optnone -Werror -Wall -o /dev/null %s
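+// The final RUN line has no FileCheck pipe; it only verifies that the file
+// compiles to assembly cleanly under -Werror -Wall.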
+#include <arm_sme_draft_spec_subject_to_change.h>
+
+#ifdef SVE_OVERLOADED_FORMS
+// A simple used,unused... macro, long enough to represent any SVE builtin.
+#define SVE_ACLE_FUNC(A1,A2_UNUSED,A3,A4_UNUSED,A5) A1##A3##A5
+#else
+#define SVE_ACLE_FUNC(A1,A2,A3,A4,A5) A1##A2##A3##A4##A5
+#endif
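+
+// For example, SVE_ACLE_FUNC(svmls_za32,_f16,_vg2x2,,) pastes together
+// svmls_za32_f16_vg2x2 by default, and the overloaded name svmls_za32_vg2x2
+// when SVE_OVERLOADED_FORMS is defined.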
+
+//
+// Multi, multi
+// CHECK-LABEL: @test_svmls2_f16(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[ADD:%.*]] = add i32 [[SLICE_BASE:%.*]], 6
+// CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 8 x half> @llvm.vector.extract.nxv8f16.nxv16f16(<vscale x 16 x half> [[ZN:%.*]], i64 0)
+// CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 8 x half> @llvm.vector.extract.nxv8f16.nxv16f16(<vscale x 16 x half> [[ZN]], i64 8)
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <vscale x 8 x half> @llvm.vector.extract.nxv8f16.nxv16f16(<vscale x 16 x half> [[ZM:%.*]], i64 0)
+// CHECK-NEXT: [[TMP3:%.*]] = tail call <vscale x 8 x half> @llvm.vector.extract.nxv8f16.nxv16f16(<vscale x 16 x half> [[ZM]], i64 8)
+// CHECK-NEXT: tail call void @llvm.aarch64.sme.fmlsl.vg2x2.nxv8f16(i32 [[ADD]], <vscale x 8 x half> [[TMP0]], <vscale x 8 x half> [[TMP1]], <vscale x 8 x half> [[TMP2]], <vscale x 8 x half> [[TMP3]])
+// CHECK-NEXT: ret void
+//
+// CPP-CHECK-LABEL: @_Z15test_svmls2_f16j13svfloat16x2_tS_(
+// CPP-CHECK-NEXT: entry:
+// CPP-CHECK-NEXT: [[ADD:%.*]] = add i32 [[SLICE_BASE:%.*]], 6
+// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 8 x half> @llvm.vector.extract.nxv8f16.nxv16f16(<vscale x 16 x half> [[ZN:%.*]], i64 0)
+// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 8 x half> @llvm.vector.extract.nxv8f16.nxv16f16(<vscale x 16 x half> [[ZN]], i64 8)
+// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call <vscale x 8 x half> @llvm.vector.extract.nxv8f16.nxv16f16(<vscale x 16 x half> [[ZM:%.*]], i64 0)
+// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call <vscale x 8 x half> @llvm.vector.extract.nxv8f16.nxv16f16(<vscale x 16 x half> [[ZM]], i64 8)
+// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.fmlsl.vg2x2.nxv8f16(i32 [[ADD]], <vscale x 8 x half> [[TMP0]], <vscale x 8 x half> [[TMP1]], <vscale x 8 x half> [[TMP2]], <vscale x 8 x half> [[TMP3]])
+// CPP-CHECK-NEXT: ret void
+//
+void test_svmls2_f16(uint32_t slice_base, svfloat16x2_t zn, svfloat16x2_t zm) __arm_streaming __arm_shared_za
+{
+ SVE_ACLE_FUNC(svmls_za32,_f16,_vg2x2,,)(slice_base + 6, zn, zm);
+}
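+
+// As the checks above show, an svfloat16x2_t tuple is passed as a single
+// <vscale x 16 x half> value and split back into its two <vscale x 8 x half>
+// halves with llvm.vector.extract (at element offsets 0 and 8) before the SME
+// intrinsic call; the x4 tests below split at offsets 0, 8, 16 and 24.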
+
+// CHECK-LABEL: @test_svmls2_bf16(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[ADD:%.*]] = add i32 [[SLICE_BASE:%.*]], 6
+// CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 8 x bfloat> @llvm.vector.extract.nxv8bf16.nxv16bf16(<vscale x 16 x bfloat> [[ZN:%.*]], i64 0)
+// CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 8 x bfloat> @llvm.vector.extract.nxv8bf16.nxv16bf16(<vscale x 16 x bfloat> [[ZN]], i64 8)
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <vscale x 8 x bfloat> @llvm.vector.extract.nxv8bf16.nxv16bf16(<vscale x 16 x bfloat> [[ZM:%.*]], i64 0)
+// CHECK-NEXT: [[TMP3:%.*]] = tail call <vscale x 8 x bfloat> @llvm.vector.extract.nxv8bf16.nxv16bf16(<vscale x 16 x bfloat> [[ZM]], i64 8)
+// CHECK-NEXT: tail call void @llvm.aarch64.sme.fmlsl.vg2x2.nxv8bf16(i32 [[ADD]], <vscale x 8 x bfloat> [[TMP0]], <vscale x 8 x bfloat> [[TMP1]], <vscale x 8 x bfloat> [[TMP2]], <vscale x 8 x bfloat> [[TMP3]])
+// CHECK-NEXT: ret void
+//
+// CPP-CHECK-LABEL: @_Z16test_svmls2_bf16j14svbfloat16x2_tS_(
+// CPP-CHECK-NEXT: entry:
+// CPP-CHECK-NEXT: [[ADD:%.*]] = add i32 [[SLICE_BASE:%.*]], 6
+// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 8 x bfloat> @llvm.vector.extract.nxv8bf16.nxv16bf16(<vscale x 16 x bfloat> [[ZN:%.*]], i64 0)
+// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 8 x bfloat> @llvm.vector.extract.nxv8bf16.nxv16bf16(<vscale x 16 x bfloat> [[ZN]], i64 8)
+// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call <vscale x 8 x bfloat> @llvm.vector.extract.nxv8bf16.nxv16bf16(<vscale x 16 x bfloat> [[ZM:%.*]], i64 0)
+// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call <vscale x 8 x bfloat> @llvm.vector.extract.nxv8bf16.nxv16bf16(<vscale x 16 x bfloat> [[ZM]], i64 8)
+// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.fmlsl.vg2x2.nxv8bf16(i32 [[ADD]], <vscale x 8 x bfloat> [[TMP0]], <vscale x 8 x bfloat> [[TMP1]], <vscale x 8 x bfloat> [[TMP2]], <vscale x 8 x bfloat> [[TMP3]])
+// CPP-CHECK-NEXT: ret void
+//
+void test_svmls2_bf16(uint32_t slice_base, svbfloat16x2_t zn, svbfloat16x2_t zm) __arm_streaming __arm_shared_za
+{
+ SVE_ACLE_FUNC(svmls_za32,_bf16,_vg2x2,,)(slice_base + 6, zn, zm);
+}
+
+// CHECK-LABEL: @test_svmls2_u16(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[ADD:%.*]] = add i32 [[SLICE_BASE:%.*]], 6
+// CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv16i16(<vscale x 16 x i16> [[ZN:%.*]], i64 0)
+// CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv16i16(<vscale x 16 x i16> [[ZN]], i64 8)
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv16i16(<vscale x 16 x i16> [[ZM:%.*]], i64 0)
+// CHECK-NEXT: [[TMP3:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv16i16(<vscale x 16 x i16> [[ZM]], i64 8)
+// CHECK-NEXT: tail call void @llvm.aarch64.sme.umlsl.vg2x2.nxv8i16(i32 [[ADD]], <vscale x 8 x i16> [[TMP0]], <vscale x 8 x i16> [[TMP1]], <vscale x 8 x i16> [[TMP2]], <vscale x 8 x i16> [[TMP3]])
+// CHECK-NEXT: ret void
+//
+// CPP-CHECK-LABEL: @_Z15test_svmls2_u16j12svuint16x2_tS_(
+// CPP-CHECK-NEXT: entry:
+// CPP-CHECK-NEXT: [[ADD:%.*]] = add i32 [[SLICE_BASE:%.*]], 6
+// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv16i16(<vscale x 16 x i16> [[ZN:%.*]], i64 0)
+// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv16i16(<vscale x 16 x i16> [[ZN]], i64 8)
+// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv16i16(<vscale x 16 x i16> [[ZM:%.*]], i64 0)
+// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv16i16(<vscale x 16 x i16> [[ZM]], i64 8)
+// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.umlsl.vg2x2.nxv8i16(i32 [[ADD]], <vscale x 8 x i16> [[TMP0]], <vscale x 8 x i16> [[TMP1]], <vscale x 8 x i16> [[TMP2]], <vscale x 8 x i16> [[TMP3]])
+// CPP-CHECK-NEXT: ret void
+//
+void test_svmls2_u16(uint32_t slice_base, svuint16x2_t zn, svuint16x2_t zm) __arm_streaming __arm_shared_za
+{
+ SVE_ACLE_FUNC(svmls_za32,_u16,_vg2x2,,)(slice_base + 6, zn, zm);
+}
+
+// CHECK-LABEL: @test_svmls2_s16(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[ADD:%.*]] = add i32 [[SLICE_BASE:%.*]], 6
+// CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv16i16(<vscale x 16 x i16> [[ZN:%.*]], i64 0)
+// CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv16i16(<vscale x 16 x i16> [[ZN]], i64 8)
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv16i16(<vscale x 16 x i16> [[ZM:%.*]], i64 0)
+// CHECK-NEXT: [[TMP3:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv16i16(<vscale x 16 x i16> [[ZM]], i64 8)
+// CHECK-NEXT: tail call void @llvm.aarch64.sme.smlsl.vg2x2.nxv8i16(i32 [[ADD]], <vscale x 8 x i16> [[TMP0]], <vscale x 8 x i16> [[TMP1]], <vscale x 8 x i16> [[TMP2]], <vscale x 8 x i16> [[TMP3]])
+// CHECK-NEXT: ret void
+//
+// CPP-CHECK-LABEL: @_Z15test_svmls2_s16j11svint16x2_tS_(
+// CPP-CHECK-NEXT: entry:
+// CPP-CHECK-NEXT: [[ADD:%.*]] = add i32 [[SLICE_BASE:%.*]], 6
+// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv16i16(<vscale x 16 x i16> [[ZN:%.*]], i64 0)
+// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv16i16(<vscale x 16 x i16> [[ZN]], i64 8)
+// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv16i16(<vscale x 16 x i16> [[ZM:%.*]], i64 0)
+// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv16i16(<vscale x 16 x i16> [[ZM]], i64 8)
+// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.smlsl.vg2x2.nxv8i16(i32 [[ADD]], <vscale x 8 x i16> [[TMP0]], <vscale x 8 x i16> [[TMP1]], <vscale x 8 x i16> [[TMP2]], <vscale x 8 x i16> [[TMP3]])
+// CPP-CHECK-NEXT: ret void
+//
+void test_svmls2_s16(uint32_t slice_base, svint16x2_t zn, svint16x2_t zm) __arm_streaming __arm_shared_za
+{
+ SVE_ACLE_FUNC(svmls_za32,_s16,_vg2x2,,)(slice_base + 6, zn, zm);
+}
+
+// CHECK-LABEL: @test_svmls4_f16(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[ADD:%.*]] = add i32 [[SLICE_BASE:%.*]], 6
+// CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 8 x half> @llvm.vector.extract.nxv8f16.nxv32f16(<vscale x 32 x half> [[ZN:%.*]], i64 0)
+// CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 8 x half> @llvm.vector.extract.nxv8f16.nxv32f16(<vscale x 32 x half> [[ZN]], i64 8)
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <vscale x 8 x half> @llvm.vector.extract.nxv8f16.nxv32f16(<vscale x 32 x half> [[ZN]], i64 16)
+// CHECK-NEXT: [[TMP3:%.*]] = tail call <vscale x 8 x half> @llvm.vector.extract.nxv8f16.nxv32f16(<vscale x 32 x half> [[ZN]], i64 24)
+// CHECK-NEXT: [[TMP4:%.*]] = tail call <vscale x 8 x half> @llvm.vector.extract.nxv8f16.nxv32f16(<vscale x 32 x half> [[ZM:%.*]], i64 0)
+// CHECK-NEXT: [[TMP5:%.*]] = tail call <vscale x 8 x half> @llvm.vector.extract.nxv8f16.nxv32f16(<vscale x 32 x half> [[ZM]], i64 8)
+// CHECK-NEXT: [[TMP6:%.*]] = tail call <vscale x 8 x half> @llvm.vector.extract.nxv8f16.nxv32f16(<vscale x 32 x half> [[ZM]], i64 16)
+// CHECK-NEXT: [[TMP7:%.*]] = tail call <vscale x 8 x half> @llvm.vector.extract.nxv8f16.nxv32f16(<vscale x 32 x half> [[ZM]], i64 24)
+// CHECK-NEXT: tail call void @llvm.aarch64.sme.fmlsl.vg2x4.nxv8f16(i32 [[ADD]], <vscale x 8 x half> [[TMP0]], <vscale x 8 x half> [[TMP1]], <vscale x 8 x half> [[TMP2]], <vscale x 8 x half> [[TMP3]], <vscale x 8 x half> [[TMP4]], <vscale x 8 x half> [[TMP5]], <vscale x 8 x half> [[TMP6]], <vscale x 8 x half> [[TMP7]])
+// CHECK-NEXT: ret void
+//
+// CPP-CHECK-LABEL: @_Z15test_svmls4_f16j13svfloat16x4_tS_(
+// CPP-CHECK-NEXT: entry:
+// CPP-CHECK-NEXT: [[ADD:%.*]] = add i32 [[SLICE_BASE:%.*]], 6
+// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 8 x half> @llvm.vector.extract.nxv8f16.nxv32f16(<vscale x 32 x half> [[ZN:%.*]], i64 0)
+// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 8 x half> @llvm.vector.extract.nxv8f16.nxv32f16(<vscale x 32 x half> [[ZN]], i64 8)
+// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call <vscale x 8 x half> @llvm.vector.extract.nxv8f16.nxv32f16(<vscale x 32 x half> [[ZN]], i64 16)
+// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call <vscale x 8 x half> @llvm.vector.extract.nxv8f16.nxv32f16(<vscale x 32 x half> [[ZN]], i64 24)
+// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call <vscale x 8 x half> @llvm.vector.extract.nxv8f16.nxv32f16(<vscale x 32 x half> [[ZM:%.*]], i64 0)
+// CPP-CHECK-NEXT: [[TMP5:%.*]] = tail call <vscale x 8 x half> @llvm.vector.extract.nxv8f16.nxv32f16(<vscale x 32 x half> [[ZM]], i64 8)
+// CPP-CHECK-NEXT: [[TMP6:%.*]] = tail call <vscale x 8 x half> @llvm.vector.extract.nxv8f16.nxv32f16(<vscale x 32 x half> [[ZM]], i64 16)
+// CPP-CHECK-NEXT: [[TMP7:%.*]] = tail call <vscale x 8 x half> @llvm.vector.extract.nxv8f16.nxv32f16(<vscale x 32 x half> [[ZM]], i64 24)
+// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.fmlsl.vg2x4.nxv8f16(i32 [[ADD]], <vscale x 8 x half> [[TMP0]], <vscale x 8 x half> [[TMP1]], <vscale x 8 x half> [[TMP2]], <vscale x 8 x half> [[TMP3]], <vscale x 8 x half> [[TMP4]], <vscale x 8 x half> [[TMP5]], <vscale x 8 x half> [[TMP6]], <vscale x 8 x half> [[TMP7]])
+// CPP-CHECK-NEXT: ret void
+//
+void test_svmls4_f16(uint32_t slice_base, svfloat16x4_t zn, svfloat16x4_t zm) __arm_streaming __arm_shared_za
+{
+ SVE_ACLE_FUNC(svmls_za32,_f16,_vg2x4,,)(slice_base + 6, zn, zm);
+}
+
+// CHECK-LABEL: @test_svmls4_bf16(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[ADD:%.*]] = add i32 [[SLICE_BASE:%.*]], 6
+// CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 8 x bfloat> @llvm.vector.extract.nxv8bf16.nxv32bf16(<vscale x 32 x bfloat> [[ZN:%.*]], i64 0)
+// CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 8 x bfloat> @llvm.vector.extract.nxv8bf16.nxv32bf16(<vscale x 32 x bfloat> [[ZN]], i64 8)
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <vscale x 8 x bfloat> @llvm.vector.extract.nxv8bf16.nxv32bf16(<vscale x 32 x bfloat> [[ZN]], i64 16)
+// CHECK-NEXT: [[TMP3:%.*]] = tail call <vscale x 8 x bfloat> @llvm.vector.extract.nxv8bf16.nxv32bf16(<vscale x 32 x bfloat> [[ZN]], i64 24)
+// CHECK-NEXT: [[TMP4:%.*]] = tail call <vscale x 8 x bfloat> @llvm.vector.extract.nxv8bf16.nxv32bf16(<vscale x 32 x bfloat> [[ZM:%.*]], i64 0)
+// CHECK-NEXT: [[TMP5:%.*]] = tail call <vscale x 8 x bfloat> @llvm.vector.extract.nxv8bf16.nxv32bf16(<vscale x 32 x bfloat> [[ZM]], i64 8)
+// CHECK-NEXT: [[TMP6:%.*]] = tail call <vscale x 8 x bfloat> @llvm.vector.extract.nxv8bf16.nxv32bf16(<vscale x 32 x bfloat> [[ZM]], i64 16)
+// CHECK-NEXT: [[TMP7:%.*]] = tail call <vscale x 8 x bfloat> @llvm.vector.extract.nxv8bf16.nxv32bf16(<vscale x 32 x bfloat> [[ZM]], i64 24)
+// CHECK-NEXT: tail call void @llvm.aarch64.sme.fmlsl.vg2x4.nxv8bf16(i32 [[ADD]], <vscale x 8 x bfloat> [[TMP0]], <vscale x 8 x bfloat> [[TMP1]], <vscale x 8 x bfloat> [[TMP2]], <vscale x 8 x bfloat> [[TMP3]], <vscale x 8 x bfloat> [[TMP4]], <vscale x 8 x bfloat> [[TMP5]], <vscale x 8 x bfloat> [[TMP6]], <vscale x 8 x bfloat> [[TMP7]])
+// CHECK-NEXT: ret void
+//
+// CPP-CHECK-LABEL: @_Z16test_svmls4_bf16j14svbfloat16x4_tS_(
+// CPP-CHECK-NEXT: entry:
+// CPP-CHECK-NEXT: [[ADD:%.*]] = add i32 [[SLICE_BASE:%.*]], 6
+// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 8 x bfloat> @llvm.vector.extract.nxv8bf16.nxv32bf16(<vscale x 32 x bfloat> [[ZN:%.*]], i64 0)
+// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 8 x bfloat> @llvm.vector.extract.nxv8bf16.nxv32bf16(<vscale x 32 x bfloat> [[ZN]], i64 8)
+// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call <vscale x 8 x bfloat> @llvm.vector.extract.nxv8bf16.nxv32bf16(<vscale x 32 x bfloat> [[ZN]], i64 16)
+// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call <vscale x 8 x bfloat> @llvm.vector.extract.nxv8bf16.nxv32bf16(<vscale x 32 x bfloat> [[ZN]], i64 24)
+// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call <vscale x 8 x bfloat> @llvm.vector.extract.nxv8bf16.nxv32bf16(<vscale x 32 x bfloat> [[ZM:%.*]], i64 0)
+// CPP-CHECK-NEXT: [[TMP5:%.*]] = tail call <vscale x 8 x bfloat> @llvm.vector.extract.nxv8bf16.nxv32bf16(<vscale x 32 x bfloat> [[ZM]], i64 8)
+// CPP-CHECK-NEXT: [[TMP6:%.*]] = tail call <vscale x 8 x bfloat> @llvm.vector.extract.nxv8bf16.nxv32bf16(<vscale x 32 x bfloat> [[ZM]], i64 16)
+// CPP-CHECK-NEXT: [[TMP7:%.*]] = tail call <vscale x 8 x bfloat> @llvm.vector.extract.nxv8bf16.nxv32bf16(<vscale x 32 x bfloat> [[ZM]], i64 24)
+// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.fmlsl.vg2x4.nxv8bf16(i32 [[ADD]], <vscale x 8 x bfloat> [[TMP0]], <vscale x 8 x bfloat> [[TMP1]], <vscale x 8 x bfloat> [[TMP2]], <vscale x 8 x bfloat> [[TMP3]], <vscale x 8 x bfloat> [[TMP4]], <vscale x 8 x bfloat> [[TMP5]], <vscale x 8 x bfloat> [[TMP6]], <vscale x 8 x bfloat> [[TMP7]])
+// CPP-CHECK-NEXT: ret void
+//
+void test_svmls4_bf16(uint32_t slice_base, svbfloat16x4_t zn, svbfloat16x4_t zm) __arm_streaming __arm_shared_za
+{
+ SVE_ACLE_FUNC(svmls_za32,_bf16,_vg2x4,,)(slice_base + 6, zn, zm);
+}
+
+// CHECK-LABEL: @test_svmls4_u16(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[ADD:%.*]] = add i32 [[SLICE_BASE:%.*]], 6
+// CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv32i16(<vscale x 32 x i16> [[ZN:%.*]], i64 0)
+// CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv32i16(<vscale x 32 x i16> [[ZN]], i64 8)
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv32i16(<vscale x 32 x i16> [[ZN]], i64 16)
+// CHECK-NEXT: [[TMP3:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv32i16(<vscale x 32 x i16> [[ZN]], i64 24)
+// CHECK-NEXT: [[TMP4:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv32i16(<vscale x 32 x i16> [[ZM:%.*]], i64 0)
+// CHECK-NEXT: [[TMP5:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv32i16(<vscale x 32 x i16> [[ZM]], i64 8)
+// CHECK-NEXT: [[TMP6:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv32i16(<vscale x 32 x i16> [[ZM]], i64 16)
+// CHECK-NEXT: [[TMP7:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv32i16(<vscale x 32 x i16> [[ZM]], i64 24)
+// CHECK-NEXT: tail call void @llvm.aarch64.sme.umlsl.vg2x4.nxv8i16(i32 [[ADD]], <vscale x 8 x i16> [[TMP0]], <vscale x 8 x i16> [[TMP1]], <vscale x 8 x i16> [[TMP2]], <vscale x 8 x i16> [[TMP3]], <vscale x 8 x i16> [[TMP4]], <vscale x 8 x i16> [[TMP5]], <vscale x 8 x i16> [[TMP6]], <vscale x 8 x i16> [[TMP7]])
+// CHECK-NEXT: ret void
+//
+// CPP-CHECK-LABEL: @_Z15test_svmls4_u16j12svuint16x4_tS_(
+// CPP-CHECK-NEXT: entry:
+// CPP-CHECK-NEXT: [[ADD:%.*]] = add i32 [[SLICE_BASE:%.*]], 6
+// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv32i16(<vscale x 32 x i16> [[ZN:%.*]], i64 0)
+// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv32i16(<vscale x 32 x i16> [[ZN]], i64 8)
+// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv32i16(<vscale x 32 x i16> [[ZN]], i64 16)
+// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv32i16(<vscale x 32 x i16> [[ZN]], i64 24)
+// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv32i16(<vscale x 32 x i16> [[ZM:%.*]], i64 0)
+// CPP-CHECK-NEXT: [[TMP5:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv32i16(<vscale x 32 x i16> [[ZM]], i64 8)
+// CPP-CHECK-NEXT: [[TMP6:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv32i16(<vscale x 32 x i16> [[ZM]], i64 16)
+// CPP-CHECK-NEXT: [[TMP7:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv32i16(<vscale x 32 x i16> [[ZM]], i64 24)
+// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.umlsl.vg2x4.nxv8i16(i32 [[ADD]], <vscale x 8 x i16> [[TMP0]], <vscale x 8 x i16> [[TMP1]], <vscale x 8 x i16> [[TMP2]], <vscale x 8 x i16> [[TMP3]], <vscale x 8 x i16> [[TMP4]], <vscale x 8 x i16> [[TMP5]], <vscale x 8 x i16> [[TMP6]], <vscale x 8 x i16> [[TMP7]])
+// CPP-CHECK-NEXT: ret void
+//
+void test_svmls4_u16(uint32_t slice_base, svuint16x4_t zn, svuint16x4_t zm) __arm_streaming __arm_shared_za
+{
+ SVE_ACLE_FUNC(svmls_za32,_u16,_vg2x4,,)(slice_base + 6, zn, zm);
+}
+
+// CHECK-LABEL: @test_svmls4_s16(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[ADD:%.*]] = add i32 [[SLICE_BASE:%.*]], 6
+// CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv32i16(<vscale x 32 x i16> [[ZN:%.*]], i64 0)
+// CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv32i16(<vscale x 32 x i16> [[ZN]], i64 8)
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv32i16(<vscale x 32 x i16> [[ZN]], i64 16)
+// CHECK-NEXT: [[TMP3:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv32i16(<vscale x 32 x i16> [[ZN]], i64 24)
+// CHECK-NEXT: [[TMP4:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv32i16(<vscale x 32 x i16> [[ZM:%.*]], i64 0)
+// CHECK-NEXT: [[TMP5:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv32i16(<vscale x 32 x i16> [[ZM]], i64 8)
+// CHECK-NEXT: [[TMP6:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv32i16(<vscale x 32 x i16> [[ZM]], i64 16)
+// CHECK-NEXT: [[TMP7:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv32i16(<vscale x 32 x i16> [[ZM]], i64 24)
+// CHECK-NEXT: tail call void @llvm.aarch64.sme.smlsl.vg2x4.nxv8i16(i32 [[ADD]], <vscale x 8 x i16> [[TMP0]], <vscale x 8 x i16> [[TMP1]], <vscale x 8 x i16> [[TMP2]], <vscale x 8 x i16> [[TMP3]], <vscale x 8 x i16> [[TMP4]], <vscale x 8 x i16> [[TMP5]], <vscale x 8 x i16> [[TMP6]], <vscale x 8 x i16> [[TMP7]])
+// CHECK-NEXT: ret void
+//
+// CPP-CHECK-LABEL: @_Z15test_svmls4_s16j11svint16x4_tS_(
+// CPP-CHECK-NEXT: entry:
+// CPP-CHECK-NEXT: [[ADD:%.*]] = add i32 [[SLICE_BASE:%.*]], 6
+// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv32i16(<vscale x 32 x i16> [[ZN:%.*]], i64 0)
+// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv32i16(<vscale x 32 x i16> [[ZN]], i64 8)
+// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv32i16(<vscale x 32 x i16> [[ZN]], i64 16)
+// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv32i16(<vscale x 32 x i16> [[ZN]], i64 24)
+// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv32i16(<vscale x 32 x i16> [[ZM:%.*]], i64 0)
+// CPP-CHECK-NEXT: [[TMP5:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv32i16(<vscale x 32 x i16> [[ZM]], i64 8)
+// CPP-CHECK-NEXT: [[TMP6:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv32i16(<vscale x 32 x i16> [[ZM]], i64 16)
+// CPP-CHECK-NEXT: [[TMP7:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv32i16(<vscale x 32 x i16> [[ZM]], i64 24)
+// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.smlsl.vg2x4.nxv8i16(i32 [[ADD]], <vscale x 8 x i16> [[TMP0]], <vscale x 8 x i16> [[TMP1]], <vscale x 8 x i16> [[TMP2]], <vscale x 8 x i16> [[TMP3]], <vscale x 8 x i16> [[TMP4]], <vscale x 8 x i16> [[TMP5]], <vscale x 8 x i16> [[TMP6]], <vscale x 8 x i16> [[TMP7]])
+// CPP-CHECK-NEXT: ret void
+//
+void test_svmls4_s16(uint32_t slice_base, svint16x4_t zn, svint16x4_t zm) __arm_streaming __arm_shared_za
+{
+ SVE_ACLE_FUNC(svmls_za32,_s16,_vg2x4,,)(slice_base + 6, zn, zm);
+}
+
+//
+// Multi, single
+// CHECK-LABEL: @test_svmls_single1_f16(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[ADD:%.*]] = add i32 [[SLICE_BASE:%.*]], 14
+// CHECK-NEXT: tail call void @llvm.aarch64.sme.fmlsl.single.vg2x1.nxv8f16(i32 [[ADD]], <vscale x 8 x half> [[ZN:%.*]], <vscale x 8 x half> [[ZM:%.*]])
+// CHECK-NEXT: ret void
+//
+// CPP-CHECK-LABEL: @_Z22test_svmls_single1_f16ju13__SVFloat16_tS_(
+// CPP-CHECK-NEXT: entry:
+// CPP-CHECK-NEXT: [[ADD:%.*]] = add i32 [[SLICE_BASE:%.*]], 14
+// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.fmlsl.single.vg2x1.nxv8f16(i32 [[ADD]], <vscale x 8 x half> [[ZN:%.*]], <vscale x 8 x half> [[ZM:%.*]])
+// CPP-CHECK-NEXT: ret void
+//
+void test_svmls_single1_f16(uint32_t slice_base, svfloat16_t zn, svfloat16_t zm) __arm_streaming __arm_shared_za
+{
+ SVE_ACLE_FUNC(svmls_za32,_f16,_vg2x1,,)(slice_base + 14, zn, zm);
+}
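+
+// The _vg2x1 variants take plain vectors for both zn and zm, so no tuple
+// extraction appears in the generated IR.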
+
+// CHECK-LABEL: @test_svmls_single1_bf16(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[ADD:%.*]] = add i32 [[SLICE_BASE:%.*]], 14
+// CHECK-NEXT: tail call void @llvm.aarch64.sme.fmlsl.single.vg2x1.nxv8bf16(i32 [[ADD]], <vscale x 8 x bfloat> [[ZN:%.*]], <vscale x 8 x bfloat> [[ZM:%.*]])
+// CHECK-NEXT: ret void
+//
+// CPP-CHECK-LABEL: @_Z23test_svmls_single1_bf16ju14__SVBfloat16_tS_(
+// CPP-CHECK-NEXT: entry:
+// CPP-CHECK-NEXT: [[ADD:%.*]] = add i32 [[SLICE_BASE:%.*]], 14
+// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.fmlsl.single.vg2x1.nxv8bf16(i32 [[ADD]], <vscale x 8 x bfloat> [[ZN:%.*]], <vscale x 8 x bfloat> [[ZM:%.*]])
+// CPP-CHECK-NEXT: ret void
+//
+void test_svmls_single1_bf16(uint32_t slice_base, svbfloat16_t zn, svbfloat16_t zm) __arm_streaming __arm_shared_za
+{
+ SVE_ACLE_FUNC(svmls_za32,_bf16,_vg2x1,,)(slice_base + 14, zn, zm);
+}
+
+// CHECK-LABEL: @test_svmls_single1_u16(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[ADD:%.*]] = add i32 [[SLICE_BASE:%.*]], 14
+// CHECK-NEXT: tail call void @llvm.aarch64.sme.umlsl.single.vg2x1.nxv8i16(i32 [[ADD]], <vscale x 8 x i16> [[ZN:%.*]], <vscale x 8 x i16> [[ZM:%.*]])
+// CHECK-NEXT: ret void
+//
+// CPP-CHECK-LABEL: @_Z22test_svmls_single1_u16ju12__SVUint16_tS_(
+// CPP-CHECK-NEXT: entry:
+// CPP-CHECK-NEXT: [[ADD:%.*]] = add i32 [[SLICE_BASE:%.*]], 14
+// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.umlsl.single.vg2x1.nxv8i16(i32 [[ADD]], <vscale x 8 x i16> [[ZN:%.*]], <vscale x 8 x i16> [[ZM:%.*]])
+// CPP-CHECK-NEXT: ret void
+//
+void test_svmls_single1_u16(uint32_t slice_base, svuint16_t zn, svuint16_t zm) __arm_streaming __arm_shared_za
+{
+ SVE_ACLE_FUNC(svmls_za32,_u16,_vg2x1,,)(slice_base + 14, zn, zm);
+}
+
+// CHECK-LABEL: @test_svmls_single1_s16(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[ADD:%.*]] = add i32 [[SLICE_BASE:%.*]], 14
+// CHECK-NEXT: tail call void @llvm.aarch64.sme.smlsl.single.vg2x1.nxv8i16(i32 [[ADD]], <vscale x 8 x i16> [[ZN:%.*]], <vscale x 8 x i16> [[ZM:%.*]])
+// CHECK-NEXT: ret void
+//
+// CPP-CHECK-LABEL: @_Z22test_svmls_single1_s16ju11__SVInt16_tS_(
+// CPP-CHECK-NEXT: entry:
+// CPP-CHECK-NEXT: [[ADD:%.*]] = add i32 [[SLICE_BASE:%.*]], 14
+// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.smlsl.single.vg2x1.nxv8i16(i32 [[ADD]], <vscale x 8 x i16> [[ZN:%.*]], <vscale x 8 x i16> [[ZM:%.*]])
+// CPP-CHECK-NEXT: ret void
+//
+void test_svmls_single1_s16(uint32_t slice_base, svint16_t zn, svint16_t zm) __arm_streaming __arm_shared_za
+{
+ SVE_ACLE_FUNC(svmls_za32,_s16,_vg2x1,,)(slice_base + 14, zn, zm);
+}
+
+// CHECK-LABEL: @test_svmls_single2_f16(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[ADD:%.*]] = add i32 [[SLICE_BASE:%.*]], 6
+// CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 8 x half> @llvm.vector.extract.nxv8f16.nxv16f16(<vscale x 16 x half> [[ZN:%.*]], i64 0)
+// CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 8 x half> @llvm.vector.extract.nxv8f16.nxv16f16(<vscale x 16 x half> [[ZN]], i64 8)
+// CHECK-NEXT: tail call void @llvm.aarch64.sme.fmlsl.single.vg2x2.nxv8f16(i32 [[ADD]], <vscale x 8 x half> [[TMP0]], <vscale x 8 x half> [[TMP1]], <vscale x 8 x half> [[ZM:%.*]])
+// CHECK-NEXT: ret void
+//
+// CPP-CHECK-LABEL: @_Z22test_svmls_single2_f16j13svfloat16x2_tu13__SVFloat16_t(
+// CPP-CHECK-NEXT: entry:
+// CPP-CHECK-NEXT: [[ADD:%.*]] = add i32 [[SLICE_BASE:%.*]], 6
+// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 8 x half> @llvm.vector.extract.nxv8f16.nxv16f16(<vscale x 16 x half> [[ZN:%.*]], i64 0)
+// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 8 x half> @llvm.vector.extract.nxv8f16.nxv16f16(<vscale x 16 x half> [[ZN]], i64 8)
+// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.fmlsl.single.vg2x2.nxv8f16(i32 [[ADD]], <vscale x 8 x half> [[TMP0]], <vscale x 8 x half> [[TMP1]], <vscale x 8 x half> [[ZM:%.*]])
+// CPP-CHECK-NEXT: ret void
+//
+void test_svmls_single2_f16(uint32_t slice_base, svfloat16x2_t zn, svfloat16_t zm) __arm_streaming __arm_shared_za
+{
+ SVE_ACLE_FUNC(svmls,_single,_za32,_f16,_vg2x2)(slice_base + 6, zn, zm);
+}
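+
+// Note that under SVE_OVERLOADED_FORMS this also expands to svmls_za32_vg2x2;
+// the overload is resolved by zm being a single vector rather than a tuple.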
+
+// CHECK-LABEL: @test_svmls_single2_bf16(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[ADD:%.*]] = add i32 [[SLICE_BASE:%.*]], 6
+// CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 8 x bfloat> @llvm.vector.extract.nxv8bf16.nxv16bf16(<vscale x 16 x bfloat> [[ZN:%.*]], i64 0)
+// CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 8 x bfloat> @llvm.vector.extract.nxv8bf16.nxv16bf16(<vscale x 16 x bfloat> [[ZN]], i64 8)
+// CHECK-NEXT: tail call void @llvm.aarch64.sme.fmlsl.single.vg2x2.nxv8bf16(i32 [[ADD]], <vscale x 8 x bfloat> [[TMP0]], <vscale x 8 x bfloat> [[TMP1]], <vscale x 8 x bfloat> [[ZM:%.*]])
+// CHECK-NEXT: ret void
+//
+// CPP-CHECK-LABEL: @_Z23test_svmls_single2_bf16j14svbfloat16x2_tu14__SVBfloat16_t(
+// CPP-CHECK-NEXT: entry:
+// CPP-CHECK-NEXT: [[ADD:%.*]] = add i32 [[SLICE_BASE:%.*]], 6
+// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 8 x bfloat> @llvm.vector.extract.nxv8bf16.nxv16bf16(<vscale x 16 x bfloat> [[ZN:%.*]], i64 0)
+// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 8 x bfloat> @llvm.vector.extract.nxv8bf16.nxv16bf16(<vscale x 16 x bfloat> [[ZN]], i64 8)
+// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.fmlsl.single.vg2x2.nxv8bf16(i32 [[ADD]], <vscale x 8 x bfloat> [[TMP0]], <vscale x 8 x bfloat> [[TMP1]], <vscale x 8 x bfloat> [[ZM:%.*]])
+// CPP-CHECK-NEXT: ret void
+//
+void test_svmls_single2_bf16(uint32_t slice_base, svbfloat16x2_t zn, svbfloat16_t zm) __arm_streaming __arm_shared_za
+{
+ SVE_ACLE_FUNC(svmls,_single,_za32,_bf16,_vg2x2)(slice_base + 6, zn, zm);
+}
+
+// CHECK-LABEL: @test_svmls_single2_u16(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[ADD:%.*]] = add i32 [[SLICE_BASE:%.*]], 6
+// CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv16i16(<vscale x 16 x i16> [[ZN:%.*]], i64 0)
+// CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv16i16(<vscale x 16 x i16> [[ZN]], i64 8)
+// CHECK-NEXT: tail call void @llvm.aarch64.sme.umlsl.single.vg2x2.nxv8i16(i32 [[ADD]], <vscale x 8 x i16> [[TMP0]], <vscale x 8 x i16> [[TMP1]], <vscale x 8 x i16> [[ZM:%.*]])
+// CHECK-NEXT: ret void
+//
+// CPP-CHECK-LABEL: @_Z22test_svmls_single2_u16j12svuint16x2_tu12__SVUint16_t(
+// CPP-CHECK-NEXT: entry:
+// CPP-CHECK-NEXT: [[ADD:%.*]] = add i32 [[SLICE_BASE:%.*]], 6
+// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv16i16(<vscale x 16 x i16> [[ZN:%.*]], i64 0)
+// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv16i16(<vscale x 16 x i16> [[ZN]], i64 8)
+// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.umlsl.single.vg2x2.nxv8i16(i32 [[ADD]], <vscale x 8 x i16> [[TMP0]], <vscale x 8 x i16> [[TMP1]], <vscale x 8 x i16> [[ZM:%.*]])
+// CPP-CHECK-NEXT: ret void
+//
+void test_svmls_single2_u16(uint32_t slice_base, svuint16x2_t zn, svuint16_t zm) __arm_streaming __arm_shared_za
+{
+ SVE_ACLE_FUNC(svmls,_single,_za32,_u16,_vg2x2)(slice_base + 6, zn, zm);
+}
+
+// CHECK-LABEL: @test_svmls_single2_s16(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[ADD:%.*]] = add i32 [[SLICE_BASE:%.*]], 6
+// CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv16i16(<vscale x 16 x i16> [[ZN:%.*]], i64 0)
+// CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv16i16(<vscale x 16 x i16> [[ZN]], i64 8)
+// CHECK-NEXT: tail call void @llvm.aarch64.sme.smlsl.single.vg2x2.nxv8i16(i32 [[ADD]], <vscale x 8 x i16> [[TMP0]], <vscale x 8 x i16> [[TMP1]], <vscale x 8 x i16> [[ZM:%.*]])
+// CHECK-NEXT: ret void
+//
+// CPP-CHECK-LABEL: @_Z22test_svmls_single2_s16j11svint16x2_tu11__SVInt16_t(
+// CPP-CHECK-NEXT: entry:
+// CPP-CHECK-NEXT: [[ADD:%.*]] = add i32 [[SLICE_BASE:%.*]], 6
+// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv16i16(<vscale x 16 x i16> [[ZN:%.*]], i64 0)
+// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv16i16(<vscale x 16 x i16> [[ZN]], i64 8)
+// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.smlsl.single.vg2x2.nxv8i16(i32 [[ADD]], <vscale x 8 x i16> [[TMP0]], <vscale x 8 x i16> [[TMP1]], <vscale x 8 x i16> [[ZM:%.*]])
+// CPP-CHECK-NEXT: ret void
+//
+void test_svmls_single2_s16(uint32_t slice_base, svint16x2_t zn, svint16_t zm) __arm_streaming __arm_shared_za
+{
+ SVE_ACLE_FUNC(svmls,_single,_za32,_s16,_vg2x2)(slice_base + 6, zn, zm);
+}
+
+// CHECK-LABEL: @test_svmls_single4_f16(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[ADD:%.*]] = add i32 [[SLICE_BASE:%.*]], 6
+// CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 8 x half> @llvm.vector.extract.nxv8f16.nxv32f16(<vscale x 32 x half> [[ZN:%.*]], i64 0)
+// CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 8 x half> @llvm.vector.extract.nxv8f16.nxv32f16(<vscale x 32 x half> [[ZN]], i64 8)
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <vscale x 8 x half> @llvm.vector.extract.nxv8f16.nxv32f16(<vscale x 32 x half> [[ZN]], i64 16)
+// CHECK-NEXT: [[TMP3:%.*]] = tail call <vscale x 8 x half> @llvm.vector.extract.nxv8f16.nxv32f16(<vscale x 32 x half> [[ZN]], i64 24)
+// CHECK-NEXT: tail call void @llvm.aarch64.sme.fmlsl.single.vg2x4.nxv8f16(i32 [[ADD]], <vscale x 8 x half> [[TMP0]], <vscale x 8 x half> [[TMP1]], <vscale x 8 x half> [[TMP2]], <vscale x 8 x half> [[TMP3]], <vscale x 8 x half> [[ZM:%.*]])
+// CHECK-NEXT: ret void
+//
+// CPP-CHECK-LABEL: @_Z22test_svmls_single4_f16j13svfloat16x4_tu13__SVFloat16_t(
+// CPP-CHECK-NEXT: entry:
+// CPP-CHECK-NEXT: [[ADD:%.*]] = add i32 [[SLICE_BASE:%.*]], 6
+// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 8 x half> @llvm.vector.extract.nxv8f16.nxv32f16(<vscale x 32 x half> [[ZN:%.*]], i64 0)
+// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 8 x half> @llvm.vector.extract.nxv8f16.nxv32f16(<vscale x 32 x half> [[ZN]], i64 8)
+// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call <vscale x 8 x half> @llvm.vector.extract.nxv8f16.nxv32f16(<vscale x 32 x half> [[ZN]], i64 16)
+// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call <vscale x 8 x half> @llvm.vector.extract.nxv8f16.nxv32f16(<vscale x 32 x half> [[ZN]], i64 24)
+// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.fmlsl.single.vg2x4.nxv8f16(i32 [[ADD]], <vscale x 8 x half> [[TMP0]], <vscale x 8 x half> [[TMP1]], <vscale x 8 x half> [[TMP2]], <vscale x 8 x half> [[TMP3]], <vscale x 8 x half> [[ZM:%.*]])
+// CPP-CHECK-NEXT: ret void
+//
+void test_svmls_single4_f16(uint32_t slice_base, svfloat16x4_t zn, svfloat16_t zm) __arm_streaming __arm_shared_za
+{
+ SVE_ACLE_FUNC(svmls,_single,_za32,_f16,_vg2x4)(slice_base + 6, zn, zm);
+}
+
+// CHECK-LABEL: @test_svmls_single4_bf16(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[ADD:%.*]] = add i32 [[SLICE_BASE:%.*]], 6
+// CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 8 x bfloat> @llvm.vector.extract.nxv8bf16.nxv32bf16(<vscale x 32 x bfloat> [[ZN:%.*]], i64 0)
+// CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 8 x bfloat> @llvm.vector.extract.nxv8bf16.nxv32bf16(<vscale x 32 x bfloat> [[ZN]], i64 8)
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <vscale x 8 x bfloat> @llvm.vector.extract.nxv8bf16.nxv32bf16(<vscale x 32 x bfloat> [[ZN]], i64 16)
+// CHECK-NEXT: [[TMP3:%.*]] = tail call <vscale x 8 x bfloat> @llvm.vector.extract.nxv8bf16.nxv32bf16(<vscale x 32 x bfloat> [[ZN]], i64 24)
+// CHECK-NEXT: tail call void @llvm.aarch64.sme.fmlsl.single.vg2x4.nxv8bf16(i32 [[ADD]], <vscale x 8 x bfloat> [[TMP0]], <vscale x 8 x bfloat> [[TMP1]], <vscale x 8 x bfloat> [[TMP2]], <vscale x 8 x bfloat> [[TMP3]], <vscale x 8 x bfloat> [[ZM:%.*]])
+// CHECK-NEXT: ret void
+//
+// CPP-CHECK-LABEL: @_Z23test_svmls_single4_bf16j14svbfloat16x4_tu14__SVBfloat16_t(
+// CPP-CHECK-NEXT: entry:
+// CPP-CHECK-NEXT: [[ADD:%.*]] = add i32 [[SLICE_BASE:%.*]], 6
+// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 8 x bfloat> @llvm.vector.extract.nxv8bf16.nxv32bf16(<vscale x 32 x bfloat> [[ZN:%.*]], i64 0)
+// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 8 x bfloat> @llvm.vector.extract.nxv8bf16.nxv32bf16(<vscale x 32 x bfloat> [[ZN]], i64 8)
+// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call <vscale x 8 x bfloat> @llvm.vector.extract.nxv8bf16.nxv32bf16(<vscale x 32 x bfloat> [[ZN]], i64 16)
+// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call <vscale x 8 x bfloat> @llvm.vector.extract.nxv8bf16.nxv32bf16(<vscale x 32 x bfloat> [[ZN]], i64 24)
+// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.fmlsl.single.vg2x4.nxv8bf16(i32 [[ADD]], <vscale x 8 x bfloat> [[TMP0]], <vscale x 8 x bfloat> [[TMP1]], <vscale x 8 x bfloat> [[TMP2]], <vscale x 8 x bfloat> [[TMP3]], <vscale x 8 x bfloat> [[ZM:%.*]])
+// CPP-CHECK-NEXT: ret void
+//
+void test_svmls_single4_bf16(uint32_t slice_base, svbfloat16x4_t zn, svbfloat16_t zm) __arm_streaming __arm_shared_za
+{
+ SVE_ACLE_FUNC(svmls,_single,_za32,_bf16,_vg2x4)(slice_base + 6, zn, zm);
+}
+
+// CHECK-LABEL: @test_svmls_single4_u16(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[ADD:%.*]] = add i32 [[SLICE_BASE:%.*]], 6
+// CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv32i16(<vscale x 32 x i16> [[ZN:%.*]], i64 0)
+// CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv32i16(<vscale x 32 x i16> [[ZN]], i64 8)
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv32i16(<vscale x 32 x i16> [[ZN]], i64 16)
+// CHECK-NEXT: [[TMP3:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv32i16(<vscale x 32 x i16> [[ZN]], i64 24)
+// CHECK-NEXT: tail call void @llvm.aarch64.sme.umlsl.single.vg2x4.nxv8i16(i32 [[ADD]], <vscale x 8 x i16> [[TMP0]], <vscale x 8 x i16> [[TMP1]], <vscale x 8 x i16> [[TMP2]], <vscale x 8 x i16> [[TMP3]], <vscale x 8 x i16> [[ZM:%.*]])
+// CHECK-NEXT: ret void
+//
+// CPP-CHECK-LABEL: @_Z22test_svmls_single4_u16j12svuint16x4_tu12__SVUint16_t(
+// CPP-CHECK-NEXT: entry:
+// CPP-CHECK-NEXT: [[ADD:%.*]] = add i32 [[SLICE_BASE:%.*]], 6
+// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv32i16(<vscale x 32 x i16> [[ZN:%.*]], i64 0)
+// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv32i16(<vscale x 32 x i16> [[ZN]], i64 8)
+// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv32i16(<vscale x 32 x i16> [[ZN]], i64 16)
+// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv32i16(<vscale x 32 x i16> [[ZN]], i64 24)
+// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.umlsl.single.vg2x4.nxv8i16(i32 [[ADD]], <vscale x 8 x i16> [[TMP0]], <vscale x 8 x i16> [[TMP1]], <vscale x 8 x i16> [[TMP2]], <vscale x 8 x i16> [[TMP3]], <vscale x 8 x i16> [[ZM:%.*]])
+// CPP-CHECK-NEXT: ret void
+//
+void test_svmls_single4_u16(uint32_t slice_base, svuint16x4_t zn, svuint16_t zm) __arm_streaming __arm_shared_za
+{
+ SVE_ACLE_FUNC(svmls,_single,_za32,_u16,_vg2x4)(slice_base + 6, zn, zm);
+}
+
+// CHECK-LABEL: @test_svmls_single4_s16(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[ADD:%.*]] = add i32 [[SLICE_BASE:%.*]], 6
+// CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv32i16(<vscale x 32 x i16> [[ZN:%.*]], i64 0)
+// CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv32i16(<vscale x 32 x i16> [[ZN]], i64 8)
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv32i16(<vscale x 32 x i16> [[ZN]], i64 16)
+// CHECK-NEXT: [[TMP3:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv32i16(<vscale x 32 x i16> [[ZN]], i64 24)
+// CHECK-NEXT: tail call void @llvm.aarch64.sme.smlsl.single.vg2x4.nxv8i16(i32 [[ADD]], <vscale x 8 x i16> [[TMP0]], <vscale x 8 x i16> [[TMP1]], <vscale x 8 x i16> [[TMP2]], <vscale x 8 x i16> [[TMP3]], <vscale x 8 x i16> [[ZM:%.*]])
+// CHECK-NEXT: ret void
+//
+// CPP-CHECK-LABEL: @_Z22test_svmls_single4_s16j11svint16x4_tu11__SVInt16_t(
+// CPP-CHECK-NEXT: entry:
+// CPP-CHECK-NEXT: [[ADD:%.*]] = add i32 [[SLICE_BASE:%.*]], 6
+// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv32i16(<vscale x 32 x i16> [[ZN:%.*]], i64 0)
+// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv32i16(<vscale x 32 x i16> [[ZN]], i64 8)
+// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv32i16(<vscale x 32 x i16> [[ZN]], i64 16)
+// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv32i16(<vscale x 32 x i16> [[ZN]], i64 24)
+// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.smlsl.single.vg2x4.nxv8i16(i32 [[ADD]], <vscale x 8 x i16> [[TMP0]], <vscale x 8 x i16> [[TMP1]], <vscale x 8 x i16> [[TMP2]], <vscale x 8 x i16> [[TMP3]], <vscale x 8 x i16> [[ZM:%.*]])
+// CPP-CHECK-NEXT: ret void
+//
+void test_svmls_single4_s16(uint32_t slice_base, svint16x4_t zn, svint16_t zm) __arm_streaming __arm_shared_za
+{
+ SVE_ACLE_FUNC(svmls,_single,_za32,_s16,_vg2x4)(slice_base + 6, zn, zm);
+}
+
+//
+// Multi, indexed
+//
+
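+// For these 16-bit element forms the lane immediate must be in the range
+// [0, 7], so the tests below exercise the upper bound of 7.
+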
+// CHECK-LABEL: @test_svmls_lane1_f16(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[ADD:%.*]] = add i32 [[SLICE_BASE:%.*]], 14
+// CHECK-NEXT: tail call void @llvm.aarch64.sme.fmlsl.lane.vg2x1.nxv8f16(i32 [[ADD]], <vscale x 8 x half> [[ZN:%.*]], <vscale x 8 x half> [[ZM:%.*]], i32 7)
+// CHECK-NEXT: ret void
+//
+// CPP-CHECK-LABEL: @_Z20test_svmls_lane1_f16ju13__SVFloat16_tS_(
+// CPP-CHECK-NEXT: entry:
+// CPP-CHECK-NEXT: [[ADD:%.*]] = add i32 [[SLICE_BASE:%.*]], 14
+// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.fmlsl.lane.vg2x1.nxv8f16(i32 [[ADD]], <vscale x 8 x half> [[ZN:%.*]], <vscale x 8 x half> [[ZM:%.*]], i32 7)
+// CPP-CHECK-NEXT: ret void
+//
+void test_svmls_lane1_f16(uint32_t slice_base, svfloat16_t zn, svfloat16_t zm) __arm_streaming __arm_shared_za
+{
+ SVE_ACLE_FUNC(svmls_lane,,_za32,_f16,_vg2x1)(slice_base + 14, zn, zm, 7);
+}
+
+// CHECK-LABEL: @test_svmls_lane1_bf16(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[ADD:%.*]] = add i32 [[SLICE_BASE:%.*]], 14
+// CHECK-NEXT: tail call void @llvm.aarch64.sme.fmlsl.lane.vg2x1.nxv8bf16(i32 [[ADD]], <vscale x 8 x bfloat> [[ZN:%.*]], <vscale x 8 x bfloat> [[ZM:%.*]], i32 7)
+// CHECK-NEXT: ret void
+//
+// CPP-CHECK-LABEL: @_Z21test_svmls_lane1_bf16ju14__SVBfloat16_tS_(
+// CPP-CHECK-NEXT: entry:
+// CPP-CHECK-NEXT: [[ADD:%.*]] = add i32 [[SLICE_BASE:%.*]], 14
+// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.fmlsl.lane.vg2x1.nxv8bf16(i32 [[ADD]], <vscale x 8 x bfloat> [[ZN:%.*]], <vscale x 8 x bfloat> [[ZM:%.*]], i32 7)
+// CPP-CHECK-NEXT: ret void
+//
+void test_svmls_lane1_bf16(uint32_t slice_base, svbfloat16_t zn, svbfloat16_t zm) __arm_streaming __arm_shared_za
+{
+ SVE_ACLE_FUNC(svmls_lane,,_za32,_bf16,_vg2x1)(slice_base + 14, zn, zm, 7);
+}
+
+// CHECK-LABEL: @test_svmls_lane1_u16(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[ADD:%.*]] = add i32 [[SLICE_BASE:%.*]], 14
+// CHECK-NEXT: tail call void @llvm.aarch64.sme.umlsl.lane.vg2x1.nxv8i16(i32 [[ADD]], <vscale x 8 x i16> [[ZN:%.*]], <vscale x 8 x i16> [[ZM:%.*]], i32 7)
+// CHECK-NEXT: ret void
+//
+// CPP-CHECK-LABEL: @_Z20test_svmls_lane1_u16ju12__SVUint16_tS_(
+// CPP-CHECK-NEXT: entry:
+// CPP-CHECK-NEXT: [[ADD:%.*]] = add i32 [[SLICE_BASE:%.*]], 14
+// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.umlsl.lane.vg2x1.nxv8i16(i32 [[ADD]], <vscale x 8 x i16> [[ZN:%.*]], <vscale x 8 x i16> [[ZM:%.*]], i32 7)
+// CPP-CHECK-NEXT: ret void
+//
+void test_svmls_lane1_u16(uint32_t slice_base, svuint16_t zn, svuint16_t zm) __arm_streaming __arm_shared_za
+{
+ SVE_ACLE_FUNC(svmls_lane,,_za32,_u16,_vg2x1)(slice_base + 14, zn, zm, 7);
+}
+
+// CHECK-LABEL: @test_svmls_lane1_s16(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[ADD:%.*]] = add i32 [[SLICE_BASE:%.*]], 14
+// CHECK-NEXT: tail call void @llvm.aarch64.sme.smlsl.lane.vg2x1.nxv8i16(i32 [[ADD]], <vscale x 8 x i16> [[ZN:%.*]], <vscale x 8 x i16> [[ZM:%.*]], i32 7)
+// CHECK-NEXT: ret void
+//
+// CPP-CHECK-LABEL: @_Z20test_svmls_lane1_s16ju11__SVInt16_tS_(
+// CPP-CHECK-NEXT: entry:
+// CPP-CHECK-NEXT: [[ADD:%.*]] = add i32 [[SLICE_BASE:%.*]], 14
+// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.smlsl.lane.vg2x1.nxv8i16(i32 [[ADD]], <vscale x 8 x i16> [[ZN:%.*]], <vscale x 8 x i16> [[ZM:%.*]], i32 7)
+// CPP-CHECK-NEXT: ret void
+//
+void test_svmls_lane1_s16(uint32_t slice_base, svint16_t zn, svint16_t zm) __arm_streaming __arm_shared_za
+{
+ SVE_ACLE_FUNC(svmls_lane,,_za32,_s16,_vg2x1)(slice_base + 14, zn, zm, 7);
+}
+
+// CHECK-LABEL: @test_svmls_lane2_f16(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[ADD:%.*]] = add i32 [[SLICE_BASE:%.*]], 6
+// CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 8 x half> @llvm.vector.extract.nxv8f16.nxv16f16(<vscale x 16 x half> [[ZN:%.*]], i64 0)
+// CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 8 x half> @llvm.vector.extract.nxv8f16.nxv16f16(<vscale x 16 x half> [[ZN]], i64 8)
+// CHECK-NEXT: tail call void @llvm.aarch64.sme.fmlsl.lane.vg2x2.nxv8f16(i32 [[ADD]], <vscale x 8 x half> [[TMP0]], <vscale x 8 x half> [[TMP1]], <vscale x 8 x half> [[ZM:%.*]], i32 7)
+// CHECK-NEXT: ret void
+//
+// CPP-CHECK-LABEL: @_Z20test_svmls_lane2_f16j13svfloat16x2_tu13__SVFloat16_t(
+// CPP-CHECK-NEXT: entry:
+// CPP-CHECK-NEXT: [[ADD:%.*]] = add i32 [[SLICE_BASE:%.*]], 6
+// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 8 x half> @llvm.vector.extract.nxv8f16.nxv16f16(<vscale x 16 x half> [[ZN:%.*]], i64 0)
+// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 8 x half> @llvm.vector.extract.nxv8f16.nxv16f16(<vscale x 16 x half> [[ZN]], i64 8)
+// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.fmlsl.lane.vg2x2.nxv8f16(i32 [[ADD]], <vscale x 8 x half> [[TMP0]], <vscale x 8 x half> [[TMP1]], <vscale x 8 x half> [[ZM:%.*]], i32 7)
+// CPP-CHECK-NEXT: ret void
+//
+void test_svmls_lane2_f16(uint32_t slice_base, svfloat16x2_t zn, svfloat16_t zm) __arm_streaming __arm_shared_za
+{
+ SVE_ACLE_FUNC(svmls_lane,,_za32,_f16,_vg2x2)(slice_base + 6, zn, zm, 7);
+}
+
+// CHECK-LABEL: @test_svmls_lane2_bf16(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[ADD:%.*]] = add i32 [[SLICE_BASE:%.*]], 6
+// CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 8 x bfloat> @llvm.vector.extract.nxv8bf16.nxv16bf16(<vscale x 16 x bfloat> [[ZN:%.*]], i64 0)
+// CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 8 x bfloat> @llvm.vector.extract.nxv8bf16.nxv16bf16(<vscale x 16 x bfloat> [[ZN]], i64 8)
+// CHECK-NEXT: tail call void @llvm.aarch64.sme.fmlsl.lane.vg2x2.nxv8bf16(i32 [[ADD]], <vscale x 8 x bfloat> [[TMP0]], <vscale x 8 x bfloat> [[TMP1]], <vscale x 8 x bfloat> [[ZM:%.*]], i32 7)
+// CHECK-NEXT: ret void
+//
+// CPP-CHECK-LABEL: @_Z21test_svmls_lane2_bf16j14svbfloat16x2_tu14__SVBfloat16_t(
+// CPP-CHECK-NEXT: entry:
+// CPP-CHECK-NEXT: [[ADD:%.*]] = add i32 [[SLICE_BASE:%.*]], 6
+// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 8 x bfloat> @llvm.vector.extract.nxv8bf16.nxv16bf16(<vscale x 16 x bfloat> [[ZN:%.*]], i64 0)
+// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 8 x bfloat> @llvm.vector.extract.nxv8bf16.nxv16bf16(<vscale x 16 x bfloat> [[ZN]], i64 8)
+// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.fmlsl.lane.vg2x2.nxv8bf16(i32 [[ADD]], <vscale x 8 x bfloat> [[TMP0]], <vscale x 8 x bfloat> [[TMP1]], <vscale x 8 x bfloat> [[ZM:%.*]], i32 7)
+// CPP-CHECK-NEXT: ret void
+//
+void test_svmls_lane2_bf16(uint32_t slice_base, svbfloat16x2_t zn, svbfloat16_t zm) __arm_streaming __arm_shared_za
+{
+ SVE_ACLE_FUNC(svmls_lane,,_za32,_bf16,_vg2x2)(slice_base + 6, zn, zm, 7);
+}
+
+// CHECK-LABEL: @test_svmls_lane2_u16(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[ADD:%.*]] = add i32 [[SLICE_BASE:%.*]], 6
+// CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv16i16(<vscale x 16 x i16> [[ZN:%.*]], i64 0)
+// CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv16i16(<vscale x 16 x i16> [[ZN]], i64 8)
+// CHECK-NEXT: tail call void @llvm.aarch64.sme.umlsl.lane.vg2x2.nxv8i16(i32 [[ADD]], <vscale x 8 x i16> [[TMP0]], <vscale x 8 x i16> [[TMP1]], <vscale x 8 x i16> [[ZM:%.*]], i32 7)
+// CHECK-NEXT: ret void
+//
+// CPP-CHECK-LABEL: @_Z20test_svmls_lane2_u16j12svuint16x2_tu12__SVUint16_t(
+// CPP-CHECK-NEXT: entry:
+// CPP-CHECK-NEXT: [[ADD:%.*]] = add i32 [[SLICE_BASE:%.*]], 6
+// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv16i16(<vscale x 16 x i16> [[ZN:%.*]], i64 0)
+// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv16i16(<vscale x 16 x i16> [[ZN]], i64 8)
+// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.umlsl.lane.vg2x2.nxv8i16(i32 [[ADD]], <vscale x 8 x i16> [[TMP0]], <vscale x 8 x i16> [[TMP1]], <vscale x 8 x i16> [[ZM:%.*]], i32 7)
+// CPP-CHECK-NEXT: ret void
+//
+void test_svmls_lane2_u16(uint32_t slice_base, svuint16x2_t zn, svuint16_t zm) __arm_streaming __arm_shared_za
+{
+ SVE_ACLE_FUNC(svmls_lane,,_za32,_u16,_vg2x2)(slice_base + 6, zn, zm, 7);
+}
+
+// CHECK-LABEL: @test_svmls_lane2_s16(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[ADD:%.*]] = add i32 [[SLICE_BASE:%.*]], 6
+// CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv16i16(<vscale x 16 x i16> [[ZN:%.*]], i64 0)
+// CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv16i16(<vscale x 16 x i16> [[ZN]], i64 8)
+// CHECK-NEXT: tail call void @llvm.aarch64.sme.smlsl.lane.vg2x2.nxv8i16(i32 [[ADD]], <vscale x 8 x i16> [[TMP0]], <vscale x 8 x i16> [[TMP1]], <vscale x 8 x i16> [[ZM:%.*]], i32 7)
+// CHECK-NEXT: ret void
+//
+// CPP-CHECK-LABEL: @_Z20test_svmls_lane2_s16j11svint16x2_tu11__SVInt16_t(
+// CPP-CHECK-NEXT: entry:
+// CPP-CHECK-NEXT: [[ADD:%.*]] = add i32 [[SLICE_BASE:%.*]], 6
+// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv16i16(<vscale x 16 x i16> [[ZN:%.*]], i64 0)
+// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv16i16(<vscale x 16 x i16> [[ZN]], i64 8)
+// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.smlsl.lane.vg2x2.nxv8i16(i32 [[ADD]], <vscale x 8 x i16> [[TMP0]], <vscale x 8 x i16> [[TMP1]], <vscale x 8 x i16> [[ZM:%.*]], i32 7)
+// CPP-CHECK-NEXT: ret void
+//
+void test_svmls_lane2_s16(uint32_t slice_base, svint16x2_t zn, svint16_t zm) __arm_streaming __arm_shared_za
+{
+ SVE_ACLE_FUNC(svmls_lane,,_za32,_s16,_vg2x2)(slice_base + 6, zn, zm, 7);
+}
+
+// CHECK-LABEL: @test_svmls_lane4_f16(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[ADD:%.*]] = add i32 [[SLICE_BASE:%.*]], 6
+// CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 8 x half> @llvm.vector.extract.nxv8f16.nxv32f16(<vscale x 32 x half> [[ZN:%.*]], i64 0)
+// CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 8 x half> @llvm.vector.extract.nxv8f16.nxv32f16(<vscale x 32 x half> [[ZN]], i64 8)
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <vscale x 8 x half> @llvm.vector.extract.nxv8f16.nxv32f16(<vscale x 32 x half> [[ZN]], i64 16)
+// CHECK-NEXT: [[TMP3:%.*]] = tail call <vscale x 8 x half> @llvm.vector.extract.nxv8f16.nxv32f16(<vscale x 32 x half> [[ZN]], i64 24)
+// CHECK-NEXT: tail call void @llvm.aarch64.sme.fmlsl.lane.vg2x4.nxv8f16(i32 [[ADD]], <vscale x 8 x half> [[TMP0]], <vscale x 8 x half> [[TMP1]], <vscale x 8 x half> [[TMP2]], <vscale x 8 x half> [[TMP3]], <vscale x 8 x half> [[ZM:%.*]], i32 7)
+// CHECK-NEXT: ret void
+//
+// CPP-CHECK-LABEL: @_Z20test_svmls_lane4_f16j13svfloat16x4_tu13__SVFloat16_t(
+// CPP-CHECK-NEXT: entry:
+// CPP-CHECK-NEXT: [[ADD:%.*]] = add i32 [[SLICE_BASE:%.*]], 6
+// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 8 x half> @llvm.vector.extract.nxv8f16.nxv32f16(<vscale x 32 x half> [[ZN:%.*]], i64 0)
+// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 8 x half> @llvm.vector.extract.nxv8f16.nxv32f16(<vscale x 32 x half> [[ZN]], i64 8)
+// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call <vscale x 8 x half> @llvm.vector.extract.nxv8f16.nxv32f16(<vscale x 32 x half> [[ZN]], i64 16)
+// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call <vscale x 8 x half> @llvm.vector.extract.nxv8f16.nxv32f16(<vscale x 32 x half> [[ZN]], i64 24)
+// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.fmlsl.lane.vg2x4.nxv8f16(i32 [[ADD]], <vscale x 8 x half> [[TMP0]], <vscale x 8 x half> [[TMP1]], <vscale x 8 x half> [[TMP2]], <vscale x 8 x half> [[TMP3]], <vscale x 8 x half> [[ZM:%.*]], i32 7)
+// CPP-CHECK-NEXT: ret void
+//
+void test_svmls_lane4_f16(uint32_t slice_base, svfloat16x4_t zn, svfloat16_t zm) __arm_streaming __arm_shared_za
+{
+ SVE_ACLE_FUNC(svmls_lane,,_za32,_f16,_vg2x4)(slice_base + 6, zn, zm, 7);
+}
+
+// CHECK-LABEL: @test_svmls_lane4_bf16(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[ADD:%.*]] = add i32 [[SLICE_BASE:%.*]], 6
+// CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 8 x bfloat> @llvm.vector.extract.nxv8bf16.nxv32bf16(<vscale x 32 x bfloat> [[ZN:%.*]], i64 0)
+// CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 8 x bfloat> @llvm.vector.extract.nxv8bf16.nxv32bf16(<vscale x 32 x bfloat> [[ZN]], i64 8)
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <vscale x 8 x bfloat> @llvm.vector.extract.nxv8bf16.nxv32bf16(<vscale x 32 x bfloat> [[ZN]], i64 16)
+// CHECK-NEXT: [[TMP3:%.*]] = tail call <vscale x 8 x bfloat> @llvm.vector.extract.nxv8bf16.nxv32bf16(<vscale x 32 x bfloat> [[ZN]], i64 24)
+// CHECK-NEXT: tail call void @llvm.aarch64.sme.fmlsl.lane.vg2x4.nxv8bf16(i32 [[ADD]], <vscale x 8 x bfloat> [[TMP0]], <vscale x 8 x bfloat> [[TMP1]], <vscale x 8 x bfloat> [[TMP2]], <vscale x 8 x bfloat> [[TMP3]], <vscale x 8 x bfloat> [[ZM:%.*]], i32 7)
+// CHECK-NEXT: ret void
+//
+// CPP-CHECK-LABEL: @_Z21test_svmls_lane4_bf16j14svbfloat16x4_tu14__SVBfloat16_t(
+// CPP-CHECK-NEXT: entry:
+// CPP-CHECK-NEXT: [[ADD:%.*]] = add i32 [[SLICE_BASE:%.*]], 6
+// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 8 x bfloat> @llvm.vector.extract.nxv8bf16.nxv32bf16(<vscale x 32 x bfloat> [[ZN:%.*]], i64 0)
+// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 8 x bfloat> @llvm.vector.extract.nxv8bf16.nxv32bf16(<vscale x 32 x bfloat> [[ZN]], i64 8)
+// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call <vscale x 8 x bfloat> @llvm.vector.extract.nxv8bf16.nxv32bf16(<vscale x 32 x bfloat> [[ZN]], i64 16)
+// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call <vscale x 8 x bfloat> @llvm.vector.extract.nxv8bf16.nxv32bf16(<vscale x 32 x bfloat> [[ZN]], i64 24)
+// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.fmlsl.lane.vg2x4.nxv8bf16(i32 [[ADD]], <vscale x 8 x bfloat> [[TMP0]], <vscale x 8 x bfloat> [[TMP1]], <vscale x 8 x bfloat> [[TMP2]], <vscale x 8 x bfloat> [[TMP3]], <vscale x 8 x bfloat> [[ZM:%.*]], i32 7)
+// CPP-CHECK-NEXT: ret void
+//
+void test_svmls_lane4_bf16(uint32_t slice_base, svbfloat16x4_t zn, svbfloat16_t zm) __arm_streaming __arm_shared_za
+{
+ SVE_ACLE_FUNC(svmls_lane,,_za32,_bf16,_vg2x4)(slice_base + 6, zn, zm, 7);
+}
+
+// CHECK-LABEL: @test_svmls_lane4_u16(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[ADD:%.*]] = add i32 [[SLICE_BASE:%.*]], 6
+// CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv32i16(<vscale x 32 x i16> [[ZN:%.*]], i64 0)
+// CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv32i16(<vscale x 32 x i16> [[ZN]], i64 8)
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv32i16(<vscale x 32 x i16> [[ZN]], i64 16)
+// CHECK-NEXT: [[TMP3:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv32i16(<vscale x 32 x i16> [[ZN]], i64 24)
+// CHECK-NEXT: tail call void @llvm.aarch64.sme.umlsl.lane.vg2x4.nxv8i16(i32 [[ADD]], <vscale x 8 x i16> [[TMP0]], <vscale x 8 x i16> [[TMP1]], <vscale x 8 x i16> [[TMP2]], <vscale x 8 x i16> [[TMP3]], <vscale x 8 x i16> [[ZM:%.*]], i32 7)
+// CHECK-NEXT: ret void
+//
+// CPP-CHECK-LABEL: @_Z20test_svmls_lane4_u16j12svuint16x4_tu12__SVUint16_t(
+// CPP-CHECK-NEXT: entry:
+// CPP-CHECK-NEXT: [[ADD:%.*]] = add i32 [[SLICE_BASE:%.*]], 6
+// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv32i16(<vscale x 32 x i16> [[ZN:%.*]], i64 0)
+// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv32i16(<vscale x 32 x i16> [[ZN]], i64 8)
+// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv32i16(<vscale x 32 x i16> [[ZN]], i64 16)
+// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv32i16(<vscale x 32 x i16> [[ZN]], i64 24)
+// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.umlsl.lane.vg2x4.nxv8i16(i32 [[ADD]], <vscale x 8 x i16> [[TMP0]], <vscale x 8 x i16> [[TMP1]], <vscale x 8 x i16> [[TMP2]], <vscale x 8 x i16> [[TMP3]], <vscale x 8 x i16> [[ZM:%.*]], i32 7)
+// CPP-CHECK-NEXT: ret void
+//
+void test_svmls_lane4_u16(uint32_t slice_base, svuint16x4_t zn, svuint16_t zm) __arm_streaming __arm_shared_za
+{
+ SVE_ACLE_FUNC(svmls_lane,,_za32,_u16,_vg2x4)(slice_base + 6, zn, zm, 7);
+}
+
+// CHECK-LABEL: @test_svmls_lane4_s16(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[ADD:%.*]] = add i32 [[SLICE_BASE:%.*]], 6
+// CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv32i16(<vscale x 32 x i16> [[ZN:%.*]], i64 0)
+// CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv32i16(<vscale x 32 x i16> [[ZN]], i64 8)
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv32i16(<vscale x 32 x i16> [[ZN]], i64 16)
+// CHECK-NEXT: [[TMP3:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv32i16(<vscale x 32 x i16> [[ZN]], i64 24)
+// CHECK-NEXT: tail call void @llvm.aarch64.sme.smlsl.lane.vg2x4.nxv8i16(i32 [[ADD]], <vscale x 8 x i16> [[TMP0]], <vscale x 8 x i16> [[TMP1]], <vscale x 8 x i16> [[TMP2]], <vscale x 8 x i16> [[TMP3]], <vscale x 8 x i16> [[ZM:%.*]], i32 7)
+// CHECK-NEXT: ret void
+//
+// CPP-CHECK-LABEL: @_Z20test_svmls_lane4_s16j11svint16x4_tu11__SVInt16_t(
+// CPP-CHECK-NEXT: entry:
+// CPP-CHECK-NEXT: [[ADD:%.*]] = add i32 [[SLICE_BASE:%.*]], 6
+// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv32i16(<vscale x 32 x i16> [[ZN:%.*]], i64 0)
+// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv32i16(<vscale x 32 x i16> [[ZN]], i64 8)
+// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv32i16(<vscale x 32 x i16> [[ZN]], i64 16)
+// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv32i16(<vscale x 32 x i16> [[ZN]], i64 24)
+// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.smlsl.lane.vg2x4.nxv8i16(i32 [[ADD]], <vscale x 8 x i16> [[TMP0]], <vscale x 8 x i16> [[TMP1]], <vscale x 8 x i16> [[TMP2]], <vscale x 8 x i16> [[TMP3]], <vscale x 8 x i16> [[ZM:%.*]], i32 7)
+// CPP-CHECK-NEXT: ret void
+//
+void test_svmls_lane4_s16(uint32_t slice_base, svint16x4_t zn, svint16_t zm) __arm_streaming __arm_shared_za
+{
+ SVE_ACLE_FUNC(svmls_lane,,_za32,_s16,_vg2x4)(slice_base + 6, zn, zm, 7);
+}
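
For reference, a minimal usage sketch of the lane-indexed form exercised by
the tests above. The function name follows the SVE_ACLE_FUNC expansion
(svmls_lane_za32_s16_vg2x4); the header name is an assumption here, and the
final argument must be a compile-time immediate lane index (7 is the largest
value these 16-bit tests exercise):

// Sketch only, not part of the patch: multiply-subtract four s16 vector
// pairs from consecutive ZA32 slices, selecting lane 7 of zm.
#include <arm_sme_draft_spec_subject_to_change.h>  // assumed header name

void mls_lane_sketch(uint32_t slice_base, svint16x4_t zn, svint16_t zm)
    __arm_streaming __arm_shared_za {
  svmls_lane_za32_s16_vg2x4(slice_base, zn, zm, 7);
}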
>From 52d968ae4af75b81452468216817a5329ceed587 Mon Sep 17 00:00:00 2001
From: Dinar Temirbulatov <Dinar.Temirbulatov at arm.com>
Date: Fri, 15 Dec 2023 11:54:25 +0000
Subject: [PATCH 2/7] Fixed a formatting error in clang/lib/CodeGen/CGBuiltin.cpp
---
clang/lib/CodeGen/CGBuiltin.cpp | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/clang/lib/CodeGen/CGBuiltin.cpp b/clang/lib/CodeGen/CGBuiltin.cpp
index b009e9f0dac83b..7bc3b7594c8f3c 100644
--- a/clang/lib/CodeGen/CGBuiltin.cpp
+++ b/clang/lib/CodeGen/CGBuiltin.cpp
@@ -10319,7 +10319,7 @@ Value *CodeGenFunction::EmitAArch64SVEBuiltinExpr(unsigned BuiltinID,
}
static void swapCommutativeSMEOperands(unsigned BuiltinID,
- SmallVectorImpl<Value*> &Ops) {
+ SmallVectorImpl<Value *> &Ops) {
unsigned MultiVec;
switch (BuiltinID) {
default:
>From 53a83736777197ec39ee0773c8e3af1f972bb439 Mon Sep 17 00:00:00 2001
From: Dinar Temirbulatov <Dinar.Temirbulatov at arm.com>
Date: Fri, 15 Dec 2023 12:17:55 +0000
Subject: [PATCH 3/7] Replaced all occurrences of slice_base + n with slice_base
---
.../aarch64-sme2-intrinsics/acle_sme2_mlal.c | 256 +++----
.../aarch64-sme2-intrinsics/acle_sme2_mlall.c | 640 +++++++-----------
.../aarch64-sme2-intrinsics/acle_sme2_mlsl.c | 256 +++----
3 files changed, 432 insertions(+), 720 deletions(-)
diff --git a/clang/test/CodeGen/aarch64-sme2-intrinsics/acle_sme2_mlal.c b/clang/test/CodeGen/aarch64-sme2-intrinsics/acle_sme2_mlal.c
index 433d92a20fb841..c216e4cce944fd 100644
--- a/clang/test/CodeGen/aarch64-sme2-intrinsics/acle_sme2_mlal.c
+++ b/clang/test/CodeGen/aarch64-sme2-intrinsics/acle_sme2_mlal.c
@@ -19,107 +19,98 @@
// Multi, multi
// CHECK-LABEL: @test_svmla2_f16(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[ADD:%.*]] = add i32 [[SLICE_BASE:%.*]], 6
// CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 8 x half> @llvm.vector.extract.nxv8f16.nxv16f16(<vscale x 16 x half> [[ZN:%.*]], i64 0)
// CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 8 x half> @llvm.vector.extract.nxv8f16.nxv16f16(<vscale x 16 x half> [[ZN]], i64 8)
// CHECK-NEXT: [[TMP2:%.*]] = tail call <vscale x 8 x half> @llvm.vector.extract.nxv8f16.nxv16f16(<vscale x 16 x half> [[ZM:%.*]], i64 0)
// CHECK-NEXT: [[TMP3:%.*]] = tail call <vscale x 8 x half> @llvm.vector.extract.nxv8f16.nxv16f16(<vscale x 16 x half> [[ZM]], i64 8)
-// CHECK-NEXT: tail call void @llvm.aarch64.sme.fmlal.vg2x2.nxv8f16(i32 [[ADD]], <vscale x 8 x half> [[TMP0]], <vscale x 8 x half> [[TMP1]], <vscale x 8 x half> [[TMP2]], <vscale x 8 x half> [[TMP3]])
+// CHECK-NEXT: tail call void @llvm.aarch64.sme.fmlal.vg2x2.nxv8f16(i32 [[SLICE_BASE:%.*]], <vscale x 8 x half> [[TMP0]], <vscale x 8 x half> [[TMP1]], <vscale x 8 x half> [[TMP2]], <vscale x 8 x half> [[TMP3]])
// CHECK-NEXT: ret void
//
// CPP-CHECK-LABEL: @_Z15test_svmla2_f16j13svfloat16x2_tS_(
// CPP-CHECK-NEXT: entry:
-// CPP-CHECK-NEXT: [[ADD:%.*]] = add i32 [[SLICE_BASE:%.*]], 6
// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 8 x half> @llvm.vector.extract.nxv8f16.nxv16f16(<vscale x 16 x half> [[ZN:%.*]], i64 0)
// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 8 x half> @llvm.vector.extract.nxv8f16.nxv16f16(<vscale x 16 x half> [[ZN]], i64 8)
// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call <vscale x 8 x half> @llvm.vector.extract.nxv8f16.nxv16f16(<vscale x 16 x half> [[ZM:%.*]], i64 0)
// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call <vscale x 8 x half> @llvm.vector.extract.nxv8f16.nxv16f16(<vscale x 16 x half> [[ZM]], i64 8)
-// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.fmlal.vg2x2.nxv8f16(i32 [[ADD]], <vscale x 8 x half> [[TMP0]], <vscale x 8 x half> [[TMP1]], <vscale x 8 x half> [[TMP2]], <vscale x 8 x half> [[TMP3]])
+// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.fmlal.vg2x2.nxv8f16(i32 [[SLICE_BASE:%.*]], <vscale x 8 x half> [[TMP0]], <vscale x 8 x half> [[TMP1]], <vscale x 8 x half> [[TMP2]], <vscale x 8 x half> [[TMP3]])
// CPP-CHECK-NEXT: ret void
//
void test_svmla2_f16(uint32_t slice_base, svfloat16x2_t zn, svfloat16x2_t zm) __arm_streaming __arm_shared_za
{
- SVE_ACLE_FUNC(svmla_za32,_f16,_vg2x2,,)(slice_base + 6, zn, zm);
+ SVE_ACLE_FUNC(svmla_za32,_f16,_vg2x2,,)(slice_base, zn, zm);
}
// CHECK-LABEL: @test_svmla2_bf16(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[ADD:%.*]] = add i32 [[SLICE_BASE:%.*]], 6
// CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 8 x bfloat> @llvm.vector.extract.nxv8bf16.nxv16bf16(<vscale x 16 x bfloat> [[ZN:%.*]], i64 0)
// CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 8 x bfloat> @llvm.vector.extract.nxv8bf16.nxv16bf16(<vscale x 16 x bfloat> [[ZN]], i64 8)
// CHECK-NEXT: [[TMP2:%.*]] = tail call <vscale x 8 x bfloat> @llvm.vector.extract.nxv8bf16.nxv16bf16(<vscale x 16 x bfloat> [[ZM:%.*]], i64 0)
// CHECK-NEXT: [[TMP3:%.*]] = tail call <vscale x 8 x bfloat> @llvm.vector.extract.nxv8bf16.nxv16bf16(<vscale x 16 x bfloat> [[ZM]], i64 8)
-// CHECK-NEXT: tail call void @llvm.aarch64.sme.fmlal.vg2x2.nxv8bf16(i32 [[ADD]], <vscale x 8 x bfloat> [[TMP0]], <vscale x 8 x bfloat> [[TMP1]], <vscale x 8 x bfloat> [[TMP2]], <vscale x 8 x bfloat> [[TMP3]])
+// CHECK-NEXT: tail call void @llvm.aarch64.sme.fmlal.vg2x2.nxv8bf16(i32 [[SLICE_BASE:%.*]], <vscale x 8 x bfloat> [[TMP0]], <vscale x 8 x bfloat> [[TMP1]], <vscale x 8 x bfloat> [[TMP2]], <vscale x 8 x bfloat> [[TMP3]])
// CHECK-NEXT: ret void
//
// CPP-CHECK-LABEL: @_Z16test_svmla2_bf16j14svbfloat16x2_tS_(
// CPP-CHECK-NEXT: entry:
-// CPP-CHECK-NEXT: [[ADD:%.*]] = add i32 [[SLICE_BASE:%.*]], 6
// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 8 x bfloat> @llvm.vector.extract.nxv8bf16.nxv16bf16(<vscale x 16 x bfloat> [[ZN:%.*]], i64 0)
// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 8 x bfloat> @llvm.vector.extract.nxv8bf16.nxv16bf16(<vscale x 16 x bfloat> [[ZN]], i64 8)
// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call <vscale x 8 x bfloat> @llvm.vector.extract.nxv8bf16.nxv16bf16(<vscale x 16 x bfloat> [[ZM:%.*]], i64 0)
// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call <vscale x 8 x bfloat> @llvm.vector.extract.nxv8bf16.nxv16bf16(<vscale x 16 x bfloat> [[ZM]], i64 8)
-// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.fmlal.vg2x2.nxv8bf16(i32 [[ADD]], <vscale x 8 x bfloat> [[TMP0]], <vscale x 8 x bfloat> [[TMP1]], <vscale x 8 x bfloat> [[TMP2]], <vscale x 8 x bfloat> [[TMP3]])
+// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.fmlal.vg2x2.nxv8bf16(i32 [[SLICE_BASE:%.*]], <vscale x 8 x bfloat> [[TMP0]], <vscale x 8 x bfloat> [[TMP1]], <vscale x 8 x bfloat> [[TMP2]], <vscale x 8 x bfloat> [[TMP3]])
// CPP-CHECK-NEXT: ret void
//
void test_svmla2_bf16(uint32_t slice_base, svbfloat16x2_t zn, svbfloat16x2_t zm) __arm_streaming __arm_shared_za
{
- SVE_ACLE_FUNC(svmla_za32,_bf16,_vg2x2,,)(slice_base + 6, zn, zm);
+ SVE_ACLE_FUNC(svmla_za32,_bf16,_vg2x2,,)(slice_base, zn, zm);
}
// CHECK-LABEL: @test_svmla2_u16(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[ADD:%.*]] = add i32 [[SLICE_BASE:%.*]], 6
// CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv16i16(<vscale x 16 x i16> [[ZN:%.*]], i64 0)
// CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv16i16(<vscale x 16 x i16> [[ZN]], i64 8)
// CHECK-NEXT: [[TMP2:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv16i16(<vscale x 16 x i16> [[ZM:%.*]], i64 0)
// CHECK-NEXT: [[TMP3:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv16i16(<vscale x 16 x i16> [[ZM]], i64 8)
-// CHECK-NEXT: tail call void @llvm.aarch64.sme.umlal.vg2x2.nxv8i16(i32 [[ADD]], <vscale x 8 x i16> [[TMP0]], <vscale x 8 x i16> [[TMP1]], <vscale x 8 x i16> [[TMP2]], <vscale x 8 x i16> [[TMP3]])
+// CHECK-NEXT: tail call void @llvm.aarch64.sme.umlal.vg2x2.nxv8i16(i32 [[SLICE_BASE:%.*]], <vscale x 8 x i16> [[TMP0]], <vscale x 8 x i16> [[TMP1]], <vscale x 8 x i16> [[TMP2]], <vscale x 8 x i16> [[TMP3]])
// CHECK-NEXT: ret void
//
// CPP-CHECK-LABEL: @_Z15test_svmla2_u16j12svuint16x2_tS_(
// CPP-CHECK-NEXT: entry:
-// CPP-CHECK-NEXT: [[ADD:%.*]] = add i32 [[SLICE_BASE:%.*]], 6
// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv16i16(<vscale x 16 x i16> [[ZN:%.*]], i64 0)
// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv16i16(<vscale x 16 x i16> [[ZN]], i64 8)
// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv16i16(<vscale x 16 x i16> [[ZM:%.*]], i64 0)
// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv16i16(<vscale x 16 x i16> [[ZM]], i64 8)
-// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.umlal.vg2x2.nxv8i16(i32 [[ADD]], <vscale x 8 x i16> [[TMP0]], <vscale x 8 x i16> [[TMP1]], <vscale x 8 x i16> [[TMP2]], <vscale x 8 x i16> [[TMP3]])
+// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.umlal.vg2x2.nxv8i16(i32 [[SLICE_BASE:%.*]], <vscale x 8 x i16> [[TMP0]], <vscale x 8 x i16> [[TMP1]], <vscale x 8 x i16> [[TMP2]], <vscale x 8 x i16> [[TMP3]])
// CPP-CHECK-NEXT: ret void
//
void test_svmla2_u16(uint32_t slice_base, svuint16x2_t zn, svuint16x2_t zm) __arm_streaming __arm_shared_za
{
- SVE_ACLE_FUNC(svmla_za32,_u16,_vg2x2,,)(slice_base + 6, zn, zm);
+ SVE_ACLE_FUNC(svmla_za32,_u16,_vg2x2,,)(slice_base, zn, zm);
}
// CHECK-LABEL: @test_svmla2_s16(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[ADD:%.*]] = add i32 [[SLICE_BASE:%.*]], 6
// CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv16i16(<vscale x 16 x i16> [[ZN:%.*]], i64 0)
// CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv16i16(<vscale x 16 x i16> [[ZN]], i64 8)
// CHECK-NEXT: [[TMP2:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv16i16(<vscale x 16 x i16> [[ZM:%.*]], i64 0)
// CHECK-NEXT: [[TMP3:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv16i16(<vscale x 16 x i16> [[ZM]], i64 8)
-// CHECK-NEXT: tail call void @llvm.aarch64.sme.smlal.vg2x2.nxv8i16(i32 [[ADD]], <vscale x 8 x i16> [[TMP0]], <vscale x 8 x i16> [[TMP1]], <vscale x 8 x i16> [[TMP2]], <vscale x 8 x i16> [[TMP3]])
+// CHECK-NEXT: tail call void @llvm.aarch64.sme.smlal.vg2x2.nxv8i16(i32 [[SLICE_BASE:%.*]], <vscale x 8 x i16> [[TMP0]], <vscale x 8 x i16> [[TMP1]], <vscale x 8 x i16> [[TMP2]], <vscale x 8 x i16> [[TMP3]])
// CHECK-NEXT: ret void
//
// CPP-CHECK-LABEL: @_Z15test_svmla2_s16j11svint16x2_tS_(
// CPP-CHECK-NEXT: entry:
-// CPP-CHECK-NEXT: [[ADD:%.*]] = add i32 [[SLICE_BASE:%.*]], 6
// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv16i16(<vscale x 16 x i16> [[ZN:%.*]], i64 0)
// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv16i16(<vscale x 16 x i16> [[ZN]], i64 8)
// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv16i16(<vscale x 16 x i16> [[ZM:%.*]], i64 0)
// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv16i16(<vscale x 16 x i16> [[ZM]], i64 8)
-// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.smlal.vg2x2.nxv8i16(i32 [[ADD]], <vscale x 8 x i16> [[TMP0]], <vscale x 8 x i16> [[TMP1]], <vscale x 8 x i16> [[TMP2]], <vscale x 8 x i16> [[TMP3]])
+// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.smlal.vg2x2.nxv8i16(i32 [[SLICE_BASE:%.*]], <vscale x 8 x i16> [[TMP0]], <vscale x 8 x i16> [[TMP1]], <vscale x 8 x i16> [[TMP2]], <vscale x 8 x i16> [[TMP3]])
// CPP-CHECK-NEXT: ret void
//
void test_svmla2_s16(uint32_t slice_base, svint16x2_t zn, svint16x2_t zm) __arm_streaming __arm_shared_za
{
- SVE_ACLE_FUNC(svmla_za32,_s16,_vg2x2,,)(slice_base + 6, zn, zm);
+ SVE_ACLE_FUNC(svmla_za32,_s16,_vg2x2,,)(slice_base, zn, zm);
}
// CHECK-LABEL: @test_svmla4_f16(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[ADD:%.*]] = add i32 [[SLICE_BASE:%.*]], 6
// CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 8 x half> @llvm.vector.extract.nxv8f16.nxv32f16(<vscale x 32 x half> [[ZN:%.*]], i64 0)
// CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 8 x half> @llvm.vector.extract.nxv8f16.nxv32f16(<vscale x 32 x half> [[ZN]], i64 8)
// CHECK-NEXT: [[TMP2:%.*]] = tail call <vscale x 8 x half> @llvm.vector.extract.nxv8f16.nxv32f16(<vscale x 32 x half> [[ZN]], i64 16)
@@ -128,12 +119,11 @@ void test_svmla2_s16(uint32_t slice_base, svint16x2_t zn, svint16x2_t zm) __arm_
// CHECK-NEXT: [[TMP5:%.*]] = tail call <vscale x 8 x half> @llvm.vector.extract.nxv8f16.nxv32f16(<vscale x 32 x half> [[ZM]], i64 8)
// CHECK-NEXT: [[TMP6:%.*]] = tail call <vscale x 8 x half> @llvm.vector.extract.nxv8f16.nxv32f16(<vscale x 32 x half> [[ZM]], i64 16)
// CHECK-NEXT: [[TMP7:%.*]] = tail call <vscale x 8 x half> @llvm.vector.extract.nxv8f16.nxv32f16(<vscale x 32 x half> [[ZM]], i64 24)
-// CHECK-NEXT: tail call void @llvm.aarch64.sme.fmlal.vg2x4.nxv8f16(i32 [[ADD]], <vscale x 8 x half> [[TMP0]], <vscale x 8 x half> [[TMP1]], <vscale x 8 x half> [[TMP2]], <vscale x 8 x half> [[TMP3]], <vscale x 8 x half> [[TMP4]], <vscale x 8 x half> [[TMP5]], <vscale x 8 x half> [[TMP6]], <vscale x 8 x half> [[TMP7]])
+// CHECK-NEXT: tail call void @llvm.aarch64.sme.fmlal.vg2x4.nxv8f16(i32 [[SLICE_BASE:%.*]], <vscale x 8 x half> [[TMP0]], <vscale x 8 x half> [[TMP1]], <vscale x 8 x half> [[TMP2]], <vscale x 8 x half> [[TMP3]], <vscale x 8 x half> [[TMP4]], <vscale x 8 x half> [[TMP5]], <vscale x 8 x half> [[TMP6]], <vscale x 8 x half> [[TMP7]])
// CHECK-NEXT: ret void
//
// CPP-CHECK-LABEL: @_Z15test_svmla4_f16j13svfloat16x4_tS_(
// CPP-CHECK-NEXT: entry:
-// CPP-CHECK-NEXT: [[ADD:%.*]] = add i32 [[SLICE_BASE:%.*]], 6
// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 8 x half> @llvm.vector.extract.nxv8f16.nxv32f16(<vscale x 32 x half> [[ZN:%.*]], i64 0)
// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 8 x half> @llvm.vector.extract.nxv8f16.nxv32f16(<vscale x 32 x half> [[ZN]], i64 8)
// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call <vscale x 8 x half> @llvm.vector.extract.nxv8f16.nxv32f16(<vscale x 32 x half> [[ZN]], i64 16)
@@ -142,17 +132,16 @@ void test_svmla2_s16(uint32_t slice_base, svint16x2_t zn, svint16x2_t zm) __arm_
// CPP-CHECK-NEXT: [[TMP5:%.*]] = tail call <vscale x 8 x half> @llvm.vector.extract.nxv8f16.nxv32f16(<vscale x 32 x half> [[ZM]], i64 8)
// CPP-CHECK-NEXT: [[TMP6:%.*]] = tail call <vscale x 8 x half> @llvm.vector.extract.nxv8f16.nxv32f16(<vscale x 32 x half> [[ZM]], i64 16)
// CPP-CHECK-NEXT: [[TMP7:%.*]] = tail call <vscale x 8 x half> @llvm.vector.extract.nxv8f16.nxv32f16(<vscale x 32 x half> [[ZM]], i64 24)
-// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.fmlal.vg2x4.nxv8f16(i32 [[ADD]], <vscale x 8 x half> [[TMP0]], <vscale x 8 x half> [[TMP1]], <vscale x 8 x half> [[TMP2]], <vscale x 8 x half> [[TMP3]], <vscale x 8 x half> [[TMP4]], <vscale x 8 x half> [[TMP5]], <vscale x 8 x half> [[TMP6]], <vscale x 8 x half> [[TMP7]])
+// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.fmlal.vg2x4.nxv8f16(i32 [[SLICE_BASE:%.*]], <vscale x 8 x half> [[TMP0]], <vscale x 8 x half> [[TMP1]], <vscale x 8 x half> [[TMP2]], <vscale x 8 x half> [[TMP3]], <vscale x 8 x half> [[TMP4]], <vscale x 8 x half> [[TMP5]], <vscale x 8 x half> [[TMP6]], <vscale x 8 x half> [[TMP7]])
// CPP-CHECK-NEXT: ret void
//
void test_svmla4_f16(uint32_t slice_base, svfloat16x4_t zn, svfloat16x4_t zm) __arm_streaming __arm_shared_za
{
- SVE_ACLE_FUNC(svmla_za32,_f16,_vg2x4,,)(slice_base + 6, zn, zm);
+ SVE_ACLE_FUNC(svmla_za32,_f16,_vg2x4,,)(slice_base, zn, zm);
}
// CHECK-LABEL: @test_svmla4_bf16(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[ADD:%.*]] = add i32 [[SLICE_BASE:%.*]], 6
// CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 8 x bfloat> @llvm.vector.extract.nxv8bf16.nxv32bf16(<vscale x 32 x bfloat> [[ZN:%.*]], i64 0)
// CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 8 x bfloat> @llvm.vector.extract.nxv8bf16.nxv32bf16(<vscale x 32 x bfloat> [[ZN]], i64 8)
// CHECK-NEXT: [[TMP2:%.*]] = tail call <vscale x 8 x bfloat> @llvm.vector.extract.nxv8bf16.nxv32bf16(<vscale x 32 x bfloat> [[ZN]], i64 16)
@@ -161,12 +150,11 @@ void test_svmla4_f16(uint32_t slice_base, svfloat16x4_t zn, svfloat16x4_t zm) __
// CHECK-NEXT: [[TMP5:%.*]] = tail call <vscale x 8 x bfloat> @llvm.vector.extract.nxv8bf16.nxv32bf16(<vscale x 32 x bfloat> [[ZM]], i64 8)
// CHECK-NEXT: [[TMP6:%.*]] = tail call <vscale x 8 x bfloat> @llvm.vector.extract.nxv8bf16.nxv32bf16(<vscale x 32 x bfloat> [[ZM]], i64 16)
// CHECK-NEXT: [[TMP7:%.*]] = tail call <vscale x 8 x bfloat> @llvm.vector.extract.nxv8bf16.nxv32bf16(<vscale x 32 x bfloat> [[ZM]], i64 24)
-// CHECK-NEXT: tail call void @llvm.aarch64.sme.fmlal.vg2x4.nxv8bf16(i32 [[ADD]], <vscale x 8 x bfloat> [[TMP0]], <vscale x 8 x bfloat> [[TMP1]], <vscale x 8 x bfloat> [[TMP2]], <vscale x 8 x bfloat> [[TMP3]], <vscale x 8 x bfloat> [[TMP4]], <vscale x 8 x bfloat> [[TMP5]], <vscale x 8 x bfloat> [[TMP6]], <vscale x 8 x bfloat> [[TMP7]])
+// CHECK-NEXT: tail call void @llvm.aarch64.sme.fmlal.vg2x4.nxv8bf16(i32 [[SLICE_BASE:%.*]], <vscale x 8 x bfloat> [[TMP0]], <vscale x 8 x bfloat> [[TMP1]], <vscale x 8 x bfloat> [[TMP2]], <vscale x 8 x bfloat> [[TMP3]], <vscale x 8 x bfloat> [[TMP4]], <vscale x 8 x bfloat> [[TMP5]], <vscale x 8 x bfloat> [[TMP6]], <vscale x 8 x bfloat> [[TMP7]])
// CHECK-NEXT: ret void
//
// CPP-CHECK-LABEL: @_Z16test_svmla4_bf16j14svbfloat16x4_tS_(
// CPP-CHECK-NEXT: entry:
-// CPP-CHECK-NEXT: [[ADD:%.*]] = add i32 [[SLICE_BASE:%.*]], 6
// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 8 x bfloat> @llvm.vector.extract.nxv8bf16.nxv32bf16(<vscale x 32 x bfloat> [[ZN:%.*]], i64 0)
// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 8 x bfloat> @llvm.vector.extract.nxv8bf16.nxv32bf16(<vscale x 32 x bfloat> [[ZN]], i64 8)
// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call <vscale x 8 x bfloat> @llvm.vector.extract.nxv8bf16.nxv32bf16(<vscale x 32 x bfloat> [[ZN]], i64 16)
@@ -175,17 +163,16 @@ void test_svmla4_f16(uint32_t slice_base, svfloat16x4_t zn, svfloat16x4_t zm) __
// CPP-CHECK-NEXT: [[TMP5:%.*]] = tail call <vscale x 8 x bfloat> @llvm.vector.extract.nxv8bf16.nxv32bf16(<vscale x 32 x bfloat> [[ZM]], i64 8)
// CPP-CHECK-NEXT: [[TMP6:%.*]] = tail call <vscale x 8 x bfloat> @llvm.vector.extract.nxv8bf16.nxv32bf16(<vscale x 32 x bfloat> [[ZM]], i64 16)
// CPP-CHECK-NEXT: [[TMP7:%.*]] = tail call <vscale x 8 x bfloat> @llvm.vector.extract.nxv8bf16.nxv32bf16(<vscale x 32 x bfloat> [[ZM]], i64 24)
-// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.fmlal.vg2x4.nxv8bf16(i32 [[ADD]], <vscale x 8 x bfloat> [[TMP0]], <vscale x 8 x bfloat> [[TMP1]], <vscale x 8 x bfloat> [[TMP2]], <vscale x 8 x bfloat> [[TMP3]], <vscale x 8 x bfloat> [[TMP4]], <vscale x 8 x bfloat> [[TMP5]], <vscale x 8 x bfloat> [[TMP6]], <vscale x 8 x bfloat> [[TMP7]])
+// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.fmlal.vg2x4.nxv8bf16(i32 [[SLICE_BASE:%.*]], <vscale x 8 x bfloat> [[TMP0]], <vscale x 8 x bfloat> [[TMP1]], <vscale x 8 x bfloat> [[TMP2]], <vscale x 8 x bfloat> [[TMP3]], <vscale x 8 x bfloat> [[TMP4]], <vscale x 8 x bfloat> [[TMP5]], <vscale x 8 x bfloat> [[TMP6]], <vscale x 8 x bfloat> [[TMP7]])
// CPP-CHECK-NEXT: ret void
//
void test_svmla4_bf16(uint32_t slice_base, svbfloat16x4_t zn, svbfloat16x4_t zm) __arm_streaming __arm_shared_za
{
- SVE_ACLE_FUNC(svmla_za32,_bf16,_vg2x4,,)(slice_base + 6, zn, zm);
+ SVE_ACLE_FUNC(svmla_za32,_bf16,_vg2x4,,)(slice_base, zn, zm);
}
// CHECK-LABEL: @test_svmla4_u16(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[ADD:%.*]] = add i32 [[SLICE_BASE:%.*]], 6
// CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv32i16(<vscale x 32 x i16> [[ZN:%.*]], i64 0)
// CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv32i16(<vscale x 32 x i16> [[ZN]], i64 8)
// CHECK-NEXT: [[TMP2:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv32i16(<vscale x 32 x i16> [[ZN]], i64 16)
@@ -194,12 +181,11 @@ void test_svmla4_bf16(uint32_t slice_base, svbfloat16x4_t zn, svbfloat16x4_t zm)
// CHECK-NEXT: [[TMP5:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv32i16(<vscale x 32 x i16> [[ZM]], i64 8)
// CHECK-NEXT: [[TMP6:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv32i16(<vscale x 32 x i16> [[ZM]], i64 16)
// CHECK-NEXT: [[TMP7:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv32i16(<vscale x 32 x i16> [[ZM]], i64 24)
-// CHECK-NEXT: tail call void @llvm.aarch64.sme.umlal.vg2x4.nxv8i16(i32 [[ADD]], <vscale x 8 x i16> [[TMP0]], <vscale x 8 x i16> [[TMP1]], <vscale x 8 x i16> [[TMP2]], <vscale x 8 x i16> [[TMP3]], <vscale x 8 x i16> [[TMP4]], <vscale x 8 x i16> [[TMP5]], <vscale x 8 x i16> [[TMP6]], <vscale x 8 x i16> [[TMP7]])
+// CHECK-NEXT: tail call void @llvm.aarch64.sme.umlal.vg2x4.nxv8i16(i32 [[SLICE_BASE:%.*]], <vscale x 8 x i16> [[TMP0]], <vscale x 8 x i16> [[TMP1]], <vscale x 8 x i16> [[TMP2]], <vscale x 8 x i16> [[TMP3]], <vscale x 8 x i16> [[TMP4]], <vscale x 8 x i16> [[TMP5]], <vscale x 8 x i16> [[TMP6]], <vscale x 8 x i16> [[TMP7]])
// CHECK-NEXT: ret void
//
// CPP-CHECK-LABEL: @_Z15test_svmla4_u16j12svuint16x4_tS_(
// CPP-CHECK-NEXT: entry:
-// CPP-CHECK-NEXT: [[ADD:%.*]] = add i32 [[SLICE_BASE:%.*]], 6
// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv32i16(<vscale x 32 x i16> [[ZN:%.*]], i64 0)
// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv32i16(<vscale x 32 x i16> [[ZN]], i64 8)
// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv32i16(<vscale x 32 x i16> [[ZN]], i64 16)
@@ -208,17 +194,16 @@ void test_svmla4_bf16(uint32_t slice_base, svbfloat16x4_t zn, svbfloat16x4_t zm)
// CPP-CHECK-NEXT: [[TMP5:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv32i16(<vscale x 32 x i16> [[ZM]], i64 8)
// CPP-CHECK-NEXT: [[TMP6:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv32i16(<vscale x 32 x i16> [[ZM]], i64 16)
// CPP-CHECK-NEXT: [[TMP7:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv32i16(<vscale x 32 x i16> [[ZM]], i64 24)
-// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.umlal.vg2x4.nxv8i16(i32 [[ADD]], <vscale x 8 x i16> [[TMP0]], <vscale x 8 x i16> [[TMP1]], <vscale x 8 x i16> [[TMP2]], <vscale x 8 x i16> [[TMP3]], <vscale x 8 x i16> [[TMP4]], <vscale x 8 x i16> [[TMP5]], <vscale x 8 x i16> [[TMP6]], <vscale x 8 x i16> [[TMP7]])
+// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.umlal.vg2x4.nxv8i16(i32 [[SLICE_BASE:%.*]], <vscale x 8 x i16> [[TMP0]], <vscale x 8 x i16> [[TMP1]], <vscale x 8 x i16> [[TMP2]], <vscale x 8 x i16> [[TMP3]], <vscale x 8 x i16> [[TMP4]], <vscale x 8 x i16> [[TMP5]], <vscale x 8 x i16> [[TMP6]], <vscale x 8 x i16> [[TMP7]])
// CPP-CHECK-NEXT: ret void
//
void test_svmla4_u16(uint32_t slice_base, svuint16x4_t zn, svuint16x4_t zm) __arm_streaming __arm_shared_za
{
- SVE_ACLE_FUNC(svmla_za32,_u16,_vg2x4,,)(slice_base + 6, zn, zm);
+ SVE_ACLE_FUNC(svmla_za32,_u16,_vg2x4,,)(slice_base, zn, zm);
}
// CHECK-LABEL: @test_svmla4_s16(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[ADD:%.*]] = add i32 [[SLICE_BASE:%.*]], 6
// CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv32i16(<vscale x 32 x i16> [[ZN:%.*]], i64 0)
// CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv32i16(<vscale x 32 x i16> [[ZN]], i64 8)
// CHECK-NEXT: [[TMP2:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv32i16(<vscale x 32 x i16> [[ZN]], i64 16)
@@ -227,12 +212,11 @@ void test_svmla4_u16(uint32_t slice_base, svuint16x4_t zn, svuint16x4_t zm) __ar
// CHECK-NEXT: [[TMP5:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv32i16(<vscale x 32 x i16> [[ZM]], i64 8)
// CHECK-NEXT: [[TMP6:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv32i16(<vscale x 32 x i16> [[ZM]], i64 16)
// CHECK-NEXT: [[TMP7:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv32i16(<vscale x 32 x i16> [[ZM]], i64 24)
-// CHECK-NEXT: tail call void @llvm.aarch64.sme.smlal.vg2x4.nxv8i16(i32 [[ADD]], <vscale x 8 x i16> [[TMP0]], <vscale x 8 x i16> [[TMP1]], <vscale x 8 x i16> [[TMP2]], <vscale x 8 x i16> [[TMP3]], <vscale x 8 x i16> [[TMP4]], <vscale x 8 x i16> [[TMP5]], <vscale x 8 x i16> [[TMP6]], <vscale x 8 x i16> [[TMP7]])
+// CHECK-NEXT: tail call void @llvm.aarch64.sme.smlal.vg2x4.nxv8i16(i32 [[SLICE_BASE:%.*]], <vscale x 8 x i16> [[TMP0]], <vscale x 8 x i16> [[TMP1]], <vscale x 8 x i16> [[TMP2]], <vscale x 8 x i16> [[TMP3]], <vscale x 8 x i16> [[TMP4]], <vscale x 8 x i16> [[TMP5]], <vscale x 8 x i16> [[TMP6]], <vscale x 8 x i16> [[TMP7]])
// CHECK-NEXT: ret void
//
// CPP-CHECK-LABEL: @_Z15test_svmla4_s16j11svint16x4_tS_(
// CPP-CHECK-NEXT: entry:
-// CPP-CHECK-NEXT: [[ADD:%.*]] = add i32 [[SLICE_BASE:%.*]], 6
// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv32i16(<vscale x 32 x i16> [[ZN:%.*]], i64 0)
// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv32i16(<vscale x 32 x i16> [[ZN]], i64 8)
// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv32i16(<vscale x 32 x i16> [[ZN]], i64 16)
@@ -241,266 +225,242 @@ void test_svmla4_u16(uint32_t slice_base, svuint16x4_t zn, svuint16x4_t zm) __ar
// CPP-CHECK-NEXT: [[TMP5:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv32i16(<vscale x 32 x i16> [[ZM]], i64 8)
// CPP-CHECK-NEXT: [[TMP6:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv32i16(<vscale x 32 x i16> [[ZM]], i64 16)
// CPP-CHECK-NEXT: [[TMP7:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv32i16(<vscale x 32 x i16> [[ZM]], i64 24)
-// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.smlal.vg2x4.nxv8i16(i32 [[ADD]], <vscale x 8 x i16> [[TMP0]], <vscale x 8 x i16> [[TMP1]], <vscale x 8 x i16> [[TMP2]], <vscale x 8 x i16> [[TMP3]], <vscale x 8 x i16> [[TMP4]], <vscale x 8 x i16> [[TMP5]], <vscale x 8 x i16> [[TMP6]], <vscale x 8 x i16> [[TMP7]])
+// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.smlal.vg2x4.nxv8i16(i32 [[SLICE_BASE:%.*]], <vscale x 8 x i16> [[TMP0]], <vscale x 8 x i16> [[TMP1]], <vscale x 8 x i16> [[TMP2]], <vscale x 8 x i16> [[TMP3]], <vscale x 8 x i16> [[TMP4]], <vscale x 8 x i16> [[TMP5]], <vscale x 8 x i16> [[TMP6]], <vscale x 8 x i16> [[TMP7]])
// CPP-CHECK-NEXT: ret void
//
void test_svmla4_s16(uint32_t slice_base, svint16x4_t zn, svint16x4_t zm) __arm_streaming __arm_shared_za
{
- SVE_ACLE_FUNC(svmla_za32,_s16,_vg2x4,,)(slice_base + 6, zn, zm);
+ SVE_ACLE_FUNC(svmla_za32,_s16,_vg2x4,,)(slice_base, zn, zm);
}
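
As a reading aid, a hedged sketch of the multi-vector by multi-vector form
above. svmla_za32_f16_vg2x2 is the SVE_ACLE_FUNC expansion used in
test_svmla2_f16; the header name is an assumption:

// Sketch only: accumulate the lane-wise products of two f16 vector pairs
// into ZA32 slices starting at slice_base.
#include <arm_sme_draft_spec_subject_to_change.h>  // assumed header name

void mla_multi_sketch(uint32_t slice_base, svfloat16x2_t zn,
                      svfloat16x2_t zm) __arm_streaming __arm_shared_za {
  svmla_za32_f16_vg2x2(slice_base, zn, zm);
}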
//
// Multi, single
// CHECK-LABEL: @test_svmla_single1_f16(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[ADD:%.*]] = add i32 [[SLICE_BASE:%.*]], 14
-// CHECK-NEXT: tail call void @llvm.aarch64.sme.fmlal.single.vg2x1.nxv8f16(i32 [[ADD]], <vscale x 8 x half> [[ZN:%.*]], <vscale x 8 x half> [[ZM:%.*]])
+// CHECK-NEXT: tail call void @llvm.aarch64.sme.fmlal.single.vg2x1.nxv8f16(i32 [[SLICE_BASE:%.*]], <vscale x 8 x half> [[ZN:%.*]], <vscale x 8 x half> [[ZM:%.*]])
// CHECK-NEXT: ret void
//
// CPP-CHECK-LABEL: @_Z22test_svmla_single1_f16ju13__SVFloat16_tS_(
// CPP-CHECK-NEXT: entry:
-// CPP-CHECK-NEXT: [[ADD:%.*]] = add i32 [[SLICE_BASE:%.*]], 14
-// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.fmlal.single.vg2x1.nxv8f16(i32 [[ADD]], <vscale x 8 x half> [[ZN:%.*]], <vscale x 8 x half> [[ZM:%.*]])
+// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.fmlal.single.vg2x1.nxv8f16(i32 [[SLICE_BASE:%.*]], <vscale x 8 x half> [[ZN:%.*]], <vscale x 8 x half> [[ZM:%.*]])
// CPP-CHECK-NEXT: ret void
//
void test_svmla_single1_f16(uint32_t slice_base, svfloat16_t zn, svfloat16_t zm) __arm_streaming __arm_shared_za
{
- SVE_ACLE_FUNC(svmla_za32,_f16,_vg2x1,,)(slice_base + 14, zn, zm);
+ SVE_ACLE_FUNC(svmla_za32,_f16,_vg2x1,,)(slice_base, zn, zm);
}
// CHECK-LABEL: @test_svmla_single1_bf16(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[ADD:%.*]] = add i32 [[SLICE_BASE:%.*]], 14
-// CHECK-NEXT: tail call void @llvm.aarch64.sme.fmlal.single.vg2x1.nxv8bf16(i32 [[ADD]], <vscale x 8 x bfloat> [[ZN:%.*]], <vscale x 8 x bfloat> [[ZM:%.*]])
+// CHECK-NEXT: tail call void @llvm.aarch64.sme.fmlal.single.vg2x1.nxv8bf16(i32 [[SLICE_BASE:%.*]], <vscale x 8 x bfloat> [[ZN:%.*]], <vscale x 8 x bfloat> [[ZM:%.*]])
// CHECK-NEXT: ret void
//
// CPP-CHECK-LABEL: @_Z23test_svmla_single1_bf16ju14__SVBfloat16_tS_(
// CPP-CHECK-NEXT: entry:
-// CPP-CHECK-NEXT: [[ADD:%.*]] = add i32 [[SLICE_BASE:%.*]], 14
-// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.fmlal.single.vg2x1.nxv8bf16(i32 [[ADD]], <vscale x 8 x bfloat> [[ZN:%.*]], <vscale x 8 x bfloat> [[ZM:%.*]])
+// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.fmlal.single.vg2x1.nxv8bf16(i32 [[SLICE_BASE:%.*]], <vscale x 8 x bfloat> [[ZN:%.*]], <vscale x 8 x bfloat> [[ZM:%.*]])
// CPP-CHECK-NEXT: ret void
//
void test_svmla_single1_bf16(uint32_t slice_base, svbfloat16_t zn, svbfloat16_t zm) __arm_streaming __arm_shared_za
{
- SVE_ACLE_FUNC(svmla_za32,_bf16,_vg2x1,,)(slice_base + 14, zn, zm);
+ SVE_ACLE_FUNC(svmla_za32,_bf16,_vg2x1,,)(slice_base, zn, zm);
}
// CHECK-LABEL: @test_svmla_single1_u16(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[ADD:%.*]] = add i32 [[SLICE_BASE:%.*]], 14
-// CHECK-NEXT: tail call void @llvm.aarch64.sme.umlal.single.vg2x1.nxv8i16(i32 [[ADD]], <vscale x 8 x i16> [[ZN:%.*]], <vscale x 8 x i16> [[ZM:%.*]])
+// CHECK-NEXT: tail call void @llvm.aarch64.sme.umlal.single.vg2x1.nxv8i16(i32 [[SLICE_BASE:%.*]], <vscale x 8 x i16> [[ZN:%.*]], <vscale x 8 x i16> [[ZM:%.*]])
// CHECK-NEXT: ret void
//
// CPP-CHECK-LABEL: @_Z22test_svmla_single1_u16ju12__SVUint16_tS_(
// CPP-CHECK-NEXT: entry:
-// CPP-CHECK-NEXT: [[ADD:%.*]] = add i32 [[SLICE_BASE:%.*]], 14
-// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.umlal.single.vg2x1.nxv8i16(i32 [[ADD]], <vscale x 8 x i16> [[ZN:%.*]], <vscale x 8 x i16> [[ZM:%.*]])
+// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.umlal.single.vg2x1.nxv8i16(i32 [[SLICE_BASE:%.*]], <vscale x 8 x i16> [[ZN:%.*]], <vscale x 8 x i16> [[ZM:%.*]])
// CPP-CHECK-NEXT: ret void
//
void test_svmla_single1_u16(uint32_t slice_base, svuint16_t zn, svuint16_t zm) __arm_streaming __arm_shared_za
{
- SVE_ACLE_FUNC(svmla_za32,_u16,_vg2x1,,)(slice_base + 14, zn, zm);
+ SVE_ACLE_FUNC(svmla_za32,_u16,_vg2x1,,)(slice_base, zn, zm);
}
// CHECK-LABEL: @test_svmla_single1_s16(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[ADD:%.*]] = add i32 [[SLICE_BASE:%.*]], 14
-// CHECK-NEXT: tail call void @llvm.aarch64.sme.smlal.single.vg2x1.nxv8i16(i32 [[ADD]], <vscale x 8 x i16> [[ZN:%.*]], <vscale x 8 x i16> [[ZM:%.*]])
+// CHECK-NEXT: tail call void @llvm.aarch64.sme.smlal.single.vg2x1.nxv8i16(i32 [[SLICE_BASE:%.*]], <vscale x 8 x i16> [[ZN:%.*]], <vscale x 8 x i16> [[ZM:%.*]])
// CHECK-NEXT: ret void
//
// CPP-CHECK-LABEL: @_Z22test_svmla_single1_s16ju11__SVInt16_tS_(
// CPP-CHECK-NEXT: entry:
-// CPP-CHECK-NEXT: [[ADD:%.*]] = add i32 [[SLICE_BASE:%.*]], 14
-// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.smlal.single.vg2x1.nxv8i16(i32 [[ADD]], <vscale x 8 x i16> [[ZN:%.*]], <vscale x 8 x i16> [[ZM:%.*]])
+// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.smlal.single.vg2x1.nxv8i16(i32 [[SLICE_BASE:%.*]], <vscale x 8 x i16> [[ZN:%.*]], <vscale x 8 x i16> [[ZM:%.*]])
// CPP-CHECK-NEXT: ret void
//
void test_svmla_single1_s16(uint32_t slice_base, svint16_t zn, svint16_t zm) __arm_streaming __arm_shared_za
{
- SVE_ACLE_FUNC(svmla_za32,_s16,_vg2x1,,)(slice_base + 14, zn, zm);
+ SVE_ACLE_FUNC(svmla_za32,_s16,_vg2x1,,)(slice_base, zn, zm);
}
// CHECK-LABEL: @test_svmla_single2_f16(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[ADD:%.*]] = add i32 [[SLICE_BASE:%.*]], 6
// CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 8 x half> @llvm.vector.extract.nxv8f16.nxv16f16(<vscale x 16 x half> [[ZN:%.*]], i64 0)
// CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 8 x half> @llvm.vector.extract.nxv8f16.nxv16f16(<vscale x 16 x half> [[ZN]], i64 8)
-// CHECK-NEXT: tail call void @llvm.aarch64.sme.fmlal.single.vg2x2.nxv8f16(i32 [[ADD]], <vscale x 8 x half> [[TMP0]], <vscale x 8 x half> [[TMP1]], <vscale x 8 x half> [[ZM:%.*]])
+// CHECK-NEXT: tail call void @llvm.aarch64.sme.fmlal.single.vg2x2.nxv8f16(i32 [[SLICE_BASE:%.*]], <vscale x 8 x half> [[TMP0]], <vscale x 8 x half> [[TMP1]], <vscale x 8 x half> [[ZM:%.*]])
// CHECK-NEXT: ret void
//
// CPP-CHECK-LABEL: @_Z22test_svmla_single2_f16j13svfloat16x2_tu13__SVFloat16_t(
// CPP-CHECK-NEXT: entry:
-// CPP-CHECK-NEXT: [[ADD:%.*]] = add i32 [[SLICE_BASE:%.*]], 6
// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 8 x half> @llvm.vector.extract.nxv8f16.nxv16f16(<vscale x 16 x half> [[ZN:%.*]], i64 0)
// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 8 x half> @llvm.vector.extract.nxv8f16.nxv16f16(<vscale x 16 x half> [[ZN]], i64 8)
-// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.fmlal.single.vg2x2.nxv8f16(i32 [[ADD]], <vscale x 8 x half> [[TMP0]], <vscale x 8 x half> [[TMP1]], <vscale x 8 x half> [[ZM:%.*]])
+// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.fmlal.single.vg2x2.nxv8f16(i32 [[SLICE_BASE:%.*]], <vscale x 8 x half> [[TMP0]], <vscale x 8 x half> [[TMP1]], <vscale x 8 x half> [[ZM:%.*]])
// CPP-CHECK-NEXT: ret void
//
void test_svmla_single2_f16(uint32_t slice_base, svfloat16x2_t zn, svfloat16_t zm) __arm_streaming __arm_shared_za
{
- SVE_ACLE_FUNC(svmla,_single,_za32,_f16,_vg2x2)(slice_base + 6, zn, zm);
+ SVE_ACLE_FUNC(svmla,_single,_za32,_f16,_vg2x2)(slice_base, zn, zm);
}
// CHECK-LABEL: @test_svmla_single2_bf16(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[ADD:%.*]] = add i32 [[SLICE_BASE:%.*]], 6
// CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 8 x bfloat> @llvm.vector.extract.nxv8bf16.nxv16bf16(<vscale x 16 x bfloat> [[ZN:%.*]], i64 0)
// CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 8 x bfloat> @llvm.vector.extract.nxv8bf16.nxv16bf16(<vscale x 16 x bfloat> [[ZN]], i64 8)
-// CHECK-NEXT: tail call void @llvm.aarch64.sme.fmlal.single.vg2x2.nxv8bf16(i32 [[ADD]], <vscale x 8 x bfloat> [[TMP0]], <vscale x 8 x bfloat> [[TMP1]], <vscale x 8 x bfloat> [[ZM:%.*]])
+// CHECK-NEXT: tail call void @llvm.aarch64.sme.fmlal.single.vg2x2.nxv8bf16(i32 [[SLICE_BASE:%.*]], <vscale x 8 x bfloat> [[TMP0]], <vscale x 8 x bfloat> [[TMP1]], <vscale x 8 x bfloat> [[ZM:%.*]])
// CHECK-NEXT: ret void
//
// CPP-CHECK-LABEL: @_Z23test_svmla_single2_bf16j14svbfloat16x2_tu14__SVBfloat16_t(
// CPP-CHECK-NEXT: entry:
-// CPP-CHECK-NEXT: [[ADD:%.*]] = add i32 [[SLICE_BASE:%.*]], 6
// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 8 x bfloat> @llvm.vector.extract.nxv8bf16.nxv16bf16(<vscale x 16 x bfloat> [[ZN:%.*]], i64 0)
// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 8 x bfloat> @llvm.vector.extract.nxv8bf16.nxv16bf16(<vscale x 16 x bfloat> [[ZN]], i64 8)
-// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.fmlal.single.vg2x2.nxv8bf16(i32 [[ADD]], <vscale x 8 x bfloat> [[TMP0]], <vscale x 8 x bfloat> [[TMP1]], <vscale x 8 x bfloat> [[ZM:%.*]])
+// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.fmlal.single.vg2x2.nxv8bf16(i32 [[SLICE_BASE:%.*]], <vscale x 8 x bfloat> [[TMP0]], <vscale x 8 x bfloat> [[TMP1]], <vscale x 8 x bfloat> [[ZM:%.*]])
// CPP-CHECK-NEXT: ret void
//
void test_svmla_single2_bf16(uint32_t slice_base, svbfloat16x2_t zn, svbfloat16_t zm) __arm_streaming __arm_shared_za
{
- SVE_ACLE_FUNC(svmla,_single,_za32,_bf16,_vg2x2)(slice_base + 6, zn, zm);
+ SVE_ACLE_FUNC(svmla,_single,_za32,_bf16,_vg2x2)(slice_base, zn, zm);
}
// CHECK-LABEL: @test_svmla_single2_u16(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[ADD:%.*]] = add i32 [[SLICE_BASE:%.*]], 6
// CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv16i16(<vscale x 16 x i16> [[ZN:%.*]], i64 0)
// CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv16i16(<vscale x 16 x i16> [[ZN]], i64 8)
-// CHECK-NEXT: tail call void @llvm.aarch64.sme.umlal.single.vg2x2.nxv8i16(i32 [[ADD]], <vscale x 8 x i16> [[TMP0]], <vscale x 8 x i16> [[TMP1]], <vscale x 8 x i16> [[ZM:%.*]])
+// CHECK-NEXT: tail call void @llvm.aarch64.sme.umlal.single.vg2x2.nxv8i16(i32 [[SLICE_BASE:%.*]], <vscale x 8 x i16> [[TMP0]], <vscale x 8 x i16> [[TMP1]], <vscale x 8 x i16> [[ZM:%.*]])
// CHECK-NEXT: ret void
//
// CPP-CHECK-LABEL: @_Z22test_svmla_single2_u16j12svuint16x2_tu12__SVUint16_t(
// CPP-CHECK-NEXT: entry:
-// CPP-CHECK-NEXT: [[ADD:%.*]] = add i32 [[SLICE_BASE:%.*]], 6
// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv16i16(<vscale x 16 x i16> [[ZN:%.*]], i64 0)
// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv16i16(<vscale x 16 x i16> [[ZN]], i64 8)
-// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.umlal.single.vg2x2.nxv8i16(i32 [[ADD]], <vscale x 8 x i16> [[TMP0]], <vscale x 8 x i16> [[TMP1]], <vscale x 8 x i16> [[ZM:%.*]])
+// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.umlal.single.vg2x2.nxv8i16(i32 [[SLICE_BASE:%.*]], <vscale x 8 x i16> [[TMP0]], <vscale x 8 x i16> [[TMP1]], <vscale x 8 x i16> [[ZM:%.*]])
// CPP-CHECK-NEXT: ret void
//
void test_svmla_single2_u16(uint32_t slice_base, svuint16x2_t zn, svuint16_t zm) __arm_streaming __arm_shared_za
{
- SVE_ACLE_FUNC(svmla,_single,_za32,_u16,_vg2x2)(slice_base + 6, zn, zm);
+ SVE_ACLE_FUNC(svmla,_single,_za32,_u16,_vg2x2)(slice_base, zn, zm);
}
// CHECK-LABEL: @test_svmla_single2_s16(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[ADD:%.*]] = add i32 [[SLICE_BASE:%.*]], 6
// CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv16i16(<vscale x 16 x i16> [[ZN:%.*]], i64 0)
// CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv16i16(<vscale x 16 x i16> [[ZN]], i64 8)
-// CHECK-NEXT: tail call void @llvm.aarch64.sme.smlal.single.vg2x2.nxv8i16(i32 [[ADD]], <vscale x 8 x i16> [[TMP0]], <vscale x 8 x i16> [[TMP1]], <vscale x 8 x i16> [[ZM:%.*]])
+// CHECK-NEXT: tail call void @llvm.aarch64.sme.smlal.single.vg2x2.nxv8i16(i32 [[SLICE_BASE:%.*]], <vscale x 8 x i16> [[TMP0]], <vscale x 8 x i16> [[TMP1]], <vscale x 8 x i16> [[ZM:%.*]])
// CHECK-NEXT: ret void
//
// CPP-CHECK-LABEL: @_Z22test_svmla_single2_s16j11svint16x2_tu11__SVInt16_t(
// CPP-CHECK-NEXT: entry:
-// CPP-CHECK-NEXT: [[ADD:%.*]] = add i32 [[SLICE_BASE:%.*]], 6
// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv16i16(<vscale x 16 x i16> [[ZN:%.*]], i64 0)
// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv16i16(<vscale x 16 x i16> [[ZN]], i64 8)
-// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.smlal.single.vg2x2.nxv8i16(i32 [[ADD]], <vscale x 8 x i16> [[TMP0]], <vscale x 8 x i16> [[TMP1]], <vscale x 8 x i16> [[ZM:%.*]])
+// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.smlal.single.vg2x2.nxv8i16(i32 [[SLICE_BASE:%.*]], <vscale x 8 x i16> [[TMP0]], <vscale x 8 x i16> [[TMP1]], <vscale x 8 x i16> [[ZM:%.*]])
// CPP-CHECK-NEXT: ret void
//
void test_svmla_single2_s16(uint32_t slice_base, svint16x2_t zn, svint16_t zm) __arm_streaming __arm_shared_za
{
- SVE_ACLE_FUNC(svmla,_single,_za32,_s16,_vg2x2)(slice_base + 6, zn, zm);
+ SVE_ACLE_FUNC(svmla,_single,_za32,_s16,_vg2x2)(slice_base, zn, zm);
}
// CHECK-LABEL: @test_svmla_single4_f16(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[ADD:%.*]] = add i32 [[SLICE_BASE:%.*]], 6
// CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 8 x half> @llvm.vector.extract.nxv8f16.nxv32f16(<vscale x 32 x half> [[ZN:%.*]], i64 0)
// CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 8 x half> @llvm.vector.extract.nxv8f16.nxv32f16(<vscale x 32 x half> [[ZN]], i64 8)
// CHECK-NEXT: [[TMP2:%.*]] = tail call <vscale x 8 x half> @llvm.vector.extract.nxv8f16.nxv32f16(<vscale x 32 x half> [[ZN]], i64 16)
// CHECK-NEXT: [[TMP3:%.*]] = tail call <vscale x 8 x half> @llvm.vector.extract.nxv8f16.nxv32f16(<vscale x 32 x half> [[ZN]], i64 24)
-// CHECK-NEXT: tail call void @llvm.aarch64.sme.fmlal.single.vg2x4.nxv8f16(i32 [[ADD]], <vscale x 8 x half> [[TMP0]], <vscale x 8 x half> [[TMP1]], <vscale x 8 x half> [[TMP2]], <vscale x 8 x half> [[TMP3]], <vscale x 8 x half> [[ZM:%.*]])
+// CHECK-NEXT: tail call void @llvm.aarch64.sme.fmlal.single.vg2x4.nxv8f16(i32 [[SLICE_BASE:%.*]], <vscale x 8 x half> [[TMP0]], <vscale x 8 x half> [[TMP1]], <vscale x 8 x half> [[TMP2]], <vscale x 8 x half> [[TMP3]], <vscale x 8 x half> [[ZM:%.*]])
// CHECK-NEXT: ret void
//
// CPP-CHECK-LABEL: @_Z22test_svmla_single4_f16j13svfloat16x4_tu13__SVFloat16_t(
// CPP-CHECK-NEXT: entry:
-// CPP-CHECK-NEXT: [[ADD:%.*]] = add i32 [[SLICE_BASE:%.*]], 6
// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 8 x half> @llvm.vector.extract.nxv8f16.nxv32f16(<vscale x 32 x half> [[ZN:%.*]], i64 0)
// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 8 x half> @llvm.vector.extract.nxv8f16.nxv32f16(<vscale x 32 x half> [[ZN]], i64 8)
// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call <vscale x 8 x half> @llvm.vector.extract.nxv8f16.nxv32f16(<vscale x 32 x half> [[ZN]], i64 16)
// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call <vscale x 8 x half> @llvm.vector.extract.nxv8f16.nxv32f16(<vscale x 32 x half> [[ZN]], i64 24)
-// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.fmlal.single.vg2x4.nxv8f16(i32 [[ADD]], <vscale x 8 x half> [[TMP0]], <vscale x 8 x half> [[TMP1]], <vscale x 8 x half> [[TMP2]], <vscale x 8 x half> [[TMP3]], <vscale x 8 x half> [[ZM:%.*]])
+// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.fmlal.single.vg2x4.nxv8f16(i32 [[SLICE_BASE:%.*]], <vscale x 8 x half> [[TMP0]], <vscale x 8 x half> [[TMP1]], <vscale x 8 x half> [[TMP2]], <vscale x 8 x half> [[TMP3]], <vscale x 8 x half> [[ZM:%.*]])
// CPP-CHECK-NEXT: ret void
//
void test_svmla_single4_f16(uint32_t slice_base, svfloat16x4_t zn, svfloat16_t zm) __arm_streaming __arm_shared_za
{
- SVE_ACLE_FUNC(svmla,_single,_za32,_f16,_vg2x4)(slice_base + 6, zn, zm);
+ SVE_ACLE_FUNC(svmla,_single,_za32,_f16,_vg2x4)(slice_base, zn, zm);
}
// CHECK-LABEL: @test_svmla_single4_bf16(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[ADD:%.*]] = add i32 [[SLICE_BASE:%.*]], 6
// CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 8 x bfloat> @llvm.vector.extract.nxv8bf16.nxv32bf16(<vscale x 32 x bfloat> [[ZN:%.*]], i64 0)
// CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 8 x bfloat> @llvm.vector.extract.nxv8bf16.nxv32bf16(<vscale x 32 x bfloat> [[ZN]], i64 8)
// CHECK-NEXT: [[TMP2:%.*]] = tail call <vscale x 8 x bfloat> @llvm.vector.extract.nxv8bf16.nxv32bf16(<vscale x 32 x bfloat> [[ZN]], i64 16)
// CHECK-NEXT: [[TMP3:%.*]] = tail call <vscale x 8 x bfloat> @llvm.vector.extract.nxv8bf16.nxv32bf16(<vscale x 32 x bfloat> [[ZN]], i64 24)
-// CHECK-NEXT: tail call void @llvm.aarch64.sme.fmlal.single.vg2x4.nxv8bf16(i32 [[ADD]], <vscale x 8 x bfloat> [[TMP0]], <vscale x 8 x bfloat> [[TMP1]], <vscale x 8 x bfloat> [[TMP2]], <vscale x 8 x bfloat> [[TMP3]], <vscale x 8 x bfloat> [[ZM:%.*]])
+// CHECK-NEXT: tail call void @llvm.aarch64.sme.fmlal.single.vg2x4.nxv8bf16(i32 [[SLICE_BASE:%.*]], <vscale x 8 x bfloat> [[TMP0]], <vscale x 8 x bfloat> [[TMP1]], <vscale x 8 x bfloat> [[TMP2]], <vscale x 8 x bfloat> [[TMP3]], <vscale x 8 x bfloat> [[ZM:%.*]])
// CHECK-NEXT: ret void
//
// CPP-CHECK-LABEL: @_Z23test_svmla_single4_bf16j14svbfloat16x4_tu14__SVBfloat16_t(
// CPP-CHECK-NEXT: entry:
-// CPP-CHECK-NEXT: [[ADD:%.*]] = add i32 [[SLICE_BASE:%.*]], 6
// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 8 x bfloat> @llvm.vector.extract.nxv8bf16.nxv32bf16(<vscale x 32 x bfloat> [[ZN:%.*]], i64 0)
// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 8 x bfloat> @llvm.vector.extract.nxv8bf16.nxv32bf16(<vscale x 32 x bfloat> [[ZN]], i64 8)
// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call <vscale x 8 x bfloat> @llvm.vector.extract.nxv8bf16.nxv32bf16(<vscale x 32 x bfloat> [[ZN]], i64 16)
// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call <vscale x 8 x bfloat> @llvm.vector.extract.nxv8bf16.nxv32bf16(<vscale x 32 x bfloat> [[ZN]], i64 24)
-// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.fmlal.single.vg2x4.nxv8bf16(i32 [[ADD]], <vscale x 8 x bfloat> [[TMP0]], <vscale x 8 x bfloat> [[TMP1]], <vscale x 8 x bfloat> [[TMP2]], <vscale x 8 x bfloat> [[TMP3]], <vscale x 8 x bfloat> [[ZM:%.*]])
+// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.fmlal.single.vg2x4.nxv8bf16(i32 [[SLICE_BASE:%.*]], <vscale x 8 x bfloat> [[TMP0]], <vscale x 8 x bfloat> [[TMP1]], <vscale x 8 x bfloat> [[TMP2]], <vscale x 8 x bfloat> [[TMP3]], <vscale x 8 x bfloat> [[ZM:%.*]])
// CPP-CHECK-NEXT: ret void
//
void test_svmla_single4_bf16(uint32_t slice_base, svbfloat16x4_t zn, svbfloat16_t zm) __arm_streaming __arm_shared_za
{
- SVE_ACLE_FUNC(svmla,_single,_za32,_bf16,_vg2x4)(slice_base + 6, zn, zm);
+ SVE_ACLE_FUNC(svmla,_single,_za32,_bf16,_vg2x4)(slice_base, zn, zm);
}
// CHECK-LABEL: @test_svmla_single4_u16(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[ADD:%.*]] = add i32 [[SLICE_BASE:%.*]], 6
// CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv32i16(<vscale x 32 x i16> [[ZN:%.*]], i64 0)
// CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv32i16(<vscale x 32 x i16> [[ZN]], i64 8)
// CHECK-NEXT: [[TMP2:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv32i16(<vscale x 32 x i16> [[ZN]], i64 16)
// CHECK-NEXT: [[TMP3:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv32i16(<vscale x 32 x i16> [[ZN]], i64 24)
-// CHECK-NEXT: tail call void @llvm.aarch64.sme.umlal.single.vg2x4.nxv8i16(i32 [[ADD]], <vscale x 8 x i16> [[TMP0]], <vscale x 8 x i16> [[TMP1]], <vscale x 8 x i16> [[TMP2]], <vscale x 8 x i16> [[TMP3]], <vscale x 8 x i16> [[ZM:%.*]])
+// CHECK-NEXT: tail call void @llvm.aarch64.sme.umlal.single.vg2x4.nxv8i16(i32 [[SLICE_BASE:%.*]], <vscale x 8 x i16> [[TMP0]], <vscale x 8 x i16> [[TMP1]], <vscale x 8 x i16> [[TMP2]], <vscale x 8 x i16> [[TMP3]], <vscale x 8 x i16> [[ZM:%.*]])
// CHECK-NEXT: ret void
//
// CPP-CHECK-LABEL: @_Z22test_svmla_single4_u16j12svuint16x4_tu12__SVUint16_t(
// CPP-CHECK-NEXT: entry:
-// CPP-CHECK-NEXT: [[ADD:%.*]] = add i32 [[SLICE_BASE:%.*]], 6
// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv32i16(<vscale x 32 x i16> [[ZN:%.*]], i64 0)
// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv32i16(<vscale x 32 x i16> [[ZN]], i64 8)
// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv32i16(<vscale x 32 x i16> [[ZN]], i64 16)
// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv32i16(<vscale x 32 x i16> [[ZN]], i64 24)
-// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.umlal.single.vg2x4.nxv8i16(i32 [[ADD]], <vscale x 8 x i16> [[TMP0]], <vscale x 8 x i16> [[TMP1]], <vscale x 8 x i16> [[TMP2]], <vscale x 8 x i16> [[TMP3]], <vscale x 8 x i16> [[ZM:%.*]])
+// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.umlal.single.vg2x4.nxv8i16(i32 [[SLICE_BASE:%.*]], <vscale x 8 x i16> [[TMP0]], <vscale x 8 x i16> [[TMP1]], <vscale x 8 x i16> [[TMP2]], <vscale x 8 x i16> [[TMP3]], <vscale x 8 x i16> [[ZM:%.*]])
// CPP-CHECK-NEXT: ret void
//
void test_svmla_single4_u16(uint32_t slice_base, svuint16x4_t zn, svuint16_t zm) __arm_streaming __arm_shared_za
{
- SVE_ACLE_FUNC(svmla,_single,_za32,_u16,_vg2x4)(slice_base + 6, zn, zm);
+ SVE_ACLE_FUNC(svmla,_single,_za32,_u16,_vg2x4)(slice_base, zn, zm);
}
// CHECK-LABEL: @test_svmla_single4_s16(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[ADD:%.*]] = add i32 [[SLICE_BASE:%.*]], 6
// CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv32i16(<vscale x 32 x i16> [[ZN:%.*]], i64 0)
// CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv32i16(<vscale x 32 x i16> [[ZN]], i64 8)
// CHECK-NEXT: [[TMP2:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv32i16(<vscale x 32 x i16> [[ZN]], i64 16)
// CHECK-NEXT: [[TMP3:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv32i16(<vscale x 32 x i16> [[ZN]], i64 24)
-// CHECK-NEXT: tail call void @llvm.aarch64.sme.smlal.single.vg2x4.nxv8i16(i32 [[ADD]], <vscale x 8 x i16> [[TMP0]], <vscale x 8 x i16> [[TMP1]], <vscale x 8 x i16> [[TMP2]], <vscale x 8 x i16> [[TMP3]], <vscale x 8 x i16> [[ZM:%.*]])
+// CHECK-NEXT: tail call void @llvm.aarch64.sme.smlal.single.vg2x4.nxv8i16(i32 [[SLICE_BASE:%.*]], <vscale x 8 x i16> [[TMP0]], <vscale x 8 x i16> [[TMP1]], <vscale x 8 x i16> [[TMP2]], <vscale x 8 x i16> [[TMP3]], <vscale x 8 x i16> [[ZM:%.*]])
// CHECK-NEXT: ret void
//
// CPP-CHECK-LABEL: @_Z22test_svmla_single4_s16j11svint16x4_tu11__SVInt16_t(
// CPP-CHECK-NEXT: entry:
-// CPP-CHECK-NEXT: [[ADD:%.*]] = add i32 [[SLICE_BASE:%.*]], 6
// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv32i16(<vscale x 32 x i16> [[ZN:%.*]], i64 0)
// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv32i16(<vscale x 32 x i16> [[ZN]], i64 8)
// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv32i16(<vscale x 32 x i16> [[ZN]], i64 16)
// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv32i16(<vscale x 32 x i16> [[ZN]], i64 24)
-// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.smlal.single.vg2x4.nxv8i16(i32 [[ADD]], <vscale x 8 x i16> [[TMP0]], <vscale x 8 x i16> [[TMP1]], <vscale x 8 x i16> [[TMP2]], <vscale x 8 x i16> [[TMP3]], <vscale x 8 x i16> [[ZM:%.*]])
+// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.smlal.single.vg2x4.nxv8i16(i32 [[SLICE_BASE:%.*]], <vscale x 8 x i16> [[TMP0]], <vscale x 8 x i16> [[TMP1]], <vscale x 8 x i16> [[TMP2]], <vscale x 8 x i16> [[TMP3]], <vscale x 8 x i16> [[ZM:%.*]])
// CPP-CHECK-NEXT: ret void
//
void test_svmla_single4_s16(uint32_t slice_base, svint16x4_t zn, svint16_t zm) __arm_streaming __arm_shared_za
{
- SVE_ACLE_FUNC(svmla,_single,_za32,_s16,_vg2x4)(slice_base + 6, zn, zm);
+ SVE_ACLE_FUNC(svmla,_single,_za32,_s16,_vg2x4)(slice_base, zn, zm);
}
//
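For reference, the _vg2x4 single forms above take a four-vector tuple for zn plus a single zm vector; the CHECK lines show the tuple being split with llvm.vector.extract before the SME intrinsic call. A minimal caller sketch, illustrative only and not part of the patch (assumes the ACLE header, named arm_sme.h in current Clang and arm_sme_draft_spec_subject_to_change.h in older snapshots, and a +sme2 target):

#include <arm_sme.h>

// Pack four u16 vectors into an x4 tuple and issue one widening
// multiply-accumulate of zn[i] * zm into 32-bit ZA at slice_base.
void mla_quad(uint32_t slice_base, svuint16_t z0, svuint16_t z1,
              svuint16_t z2, svuint16_t z3, svuint16_t zm)
    __arm_streaming __arm_shared_za {
  svuint16x4_t zn = svcreate4_u16(z0, z1, z2, z3);
  svmla_single_za32_u16_vg2x4(slice_base, zn, zm);
}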
@@ -509,252 +469,228 @@ void test_svmla_single4_s16(uint32_t slice_base, svint16x4_t zn, svint16_t zm) _
// CHECK-LABEL: @test_svmla_lane1_f16(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[ADD:%.*]] = add i32 [[SLICE_BASE:%.*]], 14
-// CHECK-NEXT: tail call void @llvm.aarch64.sme.fmlal.lane.vg2x1.nxv8f16(i32 [[ADD]], <vscale x 8 x half> [[ZN:%.*]], <vscale x 8 x half> [[ZM:%.*]], i32 7)
+// CHECK-NEXT: tail call void @llvm.aarch64.sme.fmlal.lane.vg2x1.nxv8f16(i32 [[SLICE_BASE:%.*]], <vscale x 8 x half> [[ZN:%.*]], <vscale x 8 x half> [[ZM:%.*]], i32 7)
// CHECK-NEXT: ret void
//
// CPP-CHECK-LABEL: @_Z20test_svmla_lane1_f16ju13__SVFloat16_tS_(
// CPP-CHECK-NEXT: entry:
-// CPP-CHECK-NEXT: [[ADD:%.*]] = add i32 [[SLICE_BASE:%.*]], 14
-// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.fmlal.lane.vg2x1.nxv8f16(i32 [[ADD]], <vscale x 8 x half> [[ZN:%.*]], <vscale x 8 x half> [[ZM:%.*]], i32 7)
+// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.fmlal.lane.vg2x1.nxv8f16(i32 [[SLICE_BASE:%.*]], <vscale x 8 x half> [[ZN:%.*]], <vscale x 8 x half> [[ZM:%.*]], i32 7)
// CPP-CHECK-NEXT: ret void
//
void test_svmla_lane1_f16(uint32_t slice_base, svfloat16_t zn, svfloat16_t zm) __arm_streaming __arm_shared_za
{
- SVE_ACLE_FUNC(svmla_lane,,_za32,_f16,_vg2x1)(slice_base + 14, zn, zm, 7);
+ SVE_ACLE_FUNC(svmla_lane,,_za32,_f16,_vg2x1)(slice_base, zn, zm, 7);
}
// CHECK-LABEL: @test_svmla_lane1_bf16(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[ADD:%.*]] = add i32 [[SLICE_BASE:%.*]], 14
-// CHECK-NEXT: tail call void @llvm.aarch64.sme.fmlal.lane.vg2x1.nxv8bf16(i32 [[ADD]], <vscale x 8 x bfloat> [[ZN:%.*]], <vscale x 8 x bfloat> [[ZM:%.*]], i32 7)
+// CHECK-NEXT: tail call void @llvm.aarch64.sme.fmlal.lane.vg2x1.nxv8bf16(i32 [[SLICE_BASE:%.*]], <vscale x 8 x bfloat> [[ZN:%.*]], <vscale x 8 x bfloat> [[ZM:%.*]], i32 7)
// CHECK-NEXT: ret void
//
// CPP-CHECK-LABEL: @_Z21test_svmla_lane1_bf16ju14__SVBfloat16_tS_(
// CPP-CHECK-NEXT: entry:
-// CPP-CHECK-NEXT: [[ADD:%.*]] = add i32 [[SLICE_BASE:%.*]], 14
-// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.fmlal.lane.vg2x1.nxv8bf16(i32 [[ADD]], <vscale x 8 x bfloat> [[ZN:%.*]], <vscale x 8 x bfloat> [[ZM:%.*]], i32 7)
+// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.fmlal.lane.vg2x1.nxv8bf16(i32 [[SLICE_BASE:%.*]], <vscale x 8 x bfloat> [[ZN:%.*]], <vscale x 8 x bfloat> [[ZM:%.*]], i32 7)
// CPP-CHECK-NEXT: ret void
//
void test_svmla_lane1_bf16(uint32_t slice_base, svbfloat16_t zn, svbfloat16_t zm) __arm_streaming __arm_shared_za
{
- SVE_ACLE_FUNC(svmla_lane,,_za32,_bf16,_vg2x1)(slice_base + 14, zn, zm, 7);
+ SVE_ACLE_FUNC(svmla_lane,,_za32,_bf16,_vg2x1)(slice_base, zn, zm, 7);
}
// CHECK-LABEL: @test_svmla_lane1_u16(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[ADD:%.*]] = add i32 [[SLICE_BASE:%.*]], 14
-// CHECK-NEXT: tail call void @llvm.aarch64.sme.umlal.lane.vg2x1.nxv8i16(i32 [[ADD]], <vscale x 8 x i16> [[ZN:%.*]], <vscale x 8 x i16> [[ZM:%.*]], i32 7)
+// CHECK-NEXT: tail call void @llvm.aarch64.sme.umlal.lane.vg2x1.nxv8i16(i32 [[SLICE_BASE:%.*]], <vscale x 8 x i16> [[ZN:%.*]], <vscale x 8 x i16> [[ZM:%.*]], i32 7)
// CHECK-NEXT: ret void
//
// CPP-CHECK-LABEL: @_Z20test_svmla_lane1_u16ju12__SVUint16_tS_(
// CPP-CHECK-NEXT: entry:
-// CPP-CHECK-NEXT: [[ADD:%.*]] = add i32 [[SLICE_BASE:%.*]], 14
-// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.umlal.lane.vg2x1.nxv8i16(i32 [[ADD]], <vscale x 8 x i16> [[ZN:%.*]], <vscale x 8 x i16> [[ZM:%.*]], i32 7)
+// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.umlal.lane.vg2x1.nxv8i16(i32 [[SLICE_BASE:%.*]], <vscale x 8 x i16> [[ZN:%.*]], <vscale x 8 x i16> [[ZM:%.*]], i32 7)
// CPP-CHECK-NEXT: ret void
//
void test_svmla_lane1_u16(uint32_t slice_base, svuint16_t zn, svuint16_t zm) __arm_streaming __arm_shared_za
{
- SVE_ACLE_FUNC(svmla_lane,,_za32,_u16,_vg2x1)(slice_base + 14, zn, zm, 7);
+ SVE_ACLE_FUNC(svmla_lane,,_za32,_u16,_vg2x1)(slice_base, zn, zm, 7);
}
// CHECK-LABEL: @test_svmla_lane1_s16(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[ADD:%.*]] = add i32 [[SLICE_BASE:%.*]], 14
-// CHECK-NEXT: tail call void @llvm.aarch64.sme.smlal.lane.vg2x1.nxv8i16(i32 [[ADD]], <vscale x 8 x i16> [[ZN:%.*]], <vscale x 8 x i16> [[ZM:%.*]], i32 7)
+// CHECK-NEXT: tail call void @llvm.aarch64.sme.smlal.lane.vg2x1.nxv8i16(i32 [[SLICE_BASE:%.*]], <vscale x 8 x i16> [[ZN:%.*]], <vscale x 8 x i16> [[ZM:%.*]], i32 7)
// CHECK-NEXT: ret void
//
// CPP-CHECK-LABEL: @_Z20test_svmla_lane1_s16ju11__SVInt16_tS_(
// CPP-CHECK-NEXT: entry:
-// CPP-CHECK-NEXT: [[ADD:%.*]] = add i32 [[SLICE_BASE:%.*]], 14
-// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.smlal.lane.vg2x1.nxv8i16(i32 [[ADD]], <vscale x 8 x i16> [[ZN:%.*]], <vscale x 8 x i16> [[ZM:%.*]], i32 7)
+// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.smlal.lane.vg2x1.nxv8i16(i32 [[SLICE_BASE:%.*]], <vscale x 8 x i16> [[ZN:%.*]], <vscale x 8 x i16> [[ZM:%.*]], i32 7)
// CPP-CHECK-NEXT: ret void
//
void test_svmla_lane1_s16(uint32_t slice_base, svint16_t zn, svint16_t zm) __arm_streaming __arm_shared_za
{
- SVE_ACLE_FUNC(svmla_lane,,_za32,_s16,_vg2x1)(slice_base + 14, zn, zm, 7);
+ SVE_ACLE_FUNC(svmla_lane,,_za32,_s16,_vg2x1)(slice_base, zn, zm, 7);
}
// CHECK-LABEL: @test_svmla_lane2_f16(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[ADD:%.*]] = add i32 [[SLICE_BASE:%.*]], 6
// CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 8 x half> @llvm.vector.extract.nxv8f16.nxv16f16(<vscale x 16 x half> [[ZN:%.*]], i64 0)
// CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 8 x half> @llvm.vector.extract.nxv8f16.nxv16f16(<vscale x 16 x half> [[ZN]], i64 8)
-// CHECK-NEXT: tail call void @llvm.aarch64.sme.fmlal.lane.vg2x2.nxv8f16(i32 [[ADD]], <vscale x 8 x half> [[TMP0]], <vscale x 8 x half> [[TMP1]], <vscale x 8 x half> [[ZM:%.*]], i32 7)
+// CHECK-NEXT: tail call void @llvm.aarch64.sme.fmlal.lane.vg2x2.nxv8f16(i32 [[SLICE_BASE:%.*]], <vscale x 8 x half> [[TMP0]], <vscale x 8 x half> [[TMP1]], <vscale x 8 x half> [[ZM:%.*]], i32 7)
// CHECK-NEXT: ret void
//
// CPP-CHECK-LABEL: @_Z20test_svmla_lane2_f16j13svfloat16x2_tu13__SVFloat16_t(
// CPP-CHECK-NEXT: entry:
-// CPP-CHECK-NEXT: [[ADD:%.*]] = add i32 [[SLICE_BASE:%.*]], 6
// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 8 x half> @llvm.vector.extract.nxv8f16.nxv16f16(<vscale x 16 x half> [[ZN:%.*]], i64 0)
// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 8 x half> @llvm.vector.extract.nxv8f16.nxv16f16(<vscale x 16 x half> [[ZN]], i64 8)
-// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.fmlal.lane.vg2x2.nxv8f16(i32 [[ADD]], <vscale x 8 x half> [[TMP0]], <vscale x 8 x half> [[TMP1]], <vscale x 8 x half> [[ZM:%.*]], i32 7)
+// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.fmlal.lane.vg2x2.nxv8f16(i32 [[SLICE_BASE:%.*]], <vscale x 8 x half> [[TMP0]], <vscale x 8 x half> [[TMP1]], <vscale x 8 x half> [[ZM:%.*]], i32 7)
// CPP-CHECK-NEXT: ret void
//
void test_svmla_lane2_f16(uint32_t slice_base, svfloat16x2_t zn, svfloat16_t zm) __arm_streaming __arm_shared_za
{
- SVE_ACLE_FUNC(svmla_lane,,_za32,_f16,_vg2x2)(slice_base + 6, zn, zm, 7);
+ SVE_ACLE_FUNC(svmla_lane,,_za32,_f16,_vg2x2)(slice_base, zn, zm, 7);
}
// CHECK-LABEL: @test_svmla_lane2_bf16(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[ADD:%.*]] = add i32 [[SLICE_BASE:%.*]], 6
// CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 8 x bfloat> @llvm.vector.extract.nxv8bf16.nxv16bf16(<vscale x 16 x bfloat> [[ZN:%.*]], i64 0)
// CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 8 x bfloat> @llvm.vector.extract.nxv8bf16.nxv16bf16(<vscale x 16 x bfloat> [[ZN]], i64 8)
-// CHECK-NEXT: tail call void @llvm.aarch64.sme.fmlal.lane.vg2x2.nxv8bf16(i32 [[ADD]], <vscale x 8 x bfloat> [[TMP0]], <vscale x 8 x bfloat> [[TMP1]], <vscale x 8 x bfloat> [[ZM:%.*]], i32 7)
+// CHECK-NEXT: tail call void @llvm.aarch64.sme.fmlal.lane.vg2x2.nxv8bf16(i32 [[SLICE_BASE:%.*]], <vscale x 8 x bfloat> [[TMP0]], <vscale x 8 x bfloat> [[TMP1]], <vscale x 8 x bfloat> [[ZM:%.*]], i32 7)
// CHECK-NEXT: ret void
//
// CPP-CHECK-LABEL: @_Z21test_svmla_lane2_bf16j14svbfloat16x2_tu14__SVBfloat16_t(
// CPP-CHECK-NEXT: entry:
-// CPP-CHECK-NEXT: [[ADD:%.*]] = add i32 [[SLICE_BASE:%.*]], 6
// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 8 x bfloat> @llvm.vector.extract.nxv8bf16.nxv16bf16(<vscale x 16 x bfloat> [[ZN:%.*]], i64 0)
// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 8 x bfloat> @llvm.vector.extract.nxv8bf16.nxv16bf16(<vscale x 16 x bfloat> [[ZN]], i64 8)
-// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.fmlal.lane.vg2x2.nxv8bf16(i32 [[ADD]], <vscale x 8 x bfloat> [[TMP0]], <vscale x 8 x bfloat> [[TMP1]], <vscale x 8 x bfloat> [[ZM:%.*]], i32 7)
+// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.fmlal.lane.vg2x2.nxv8bf16(i32 [[SLICE_BASE:%.*]], <vscale x 8 x bfloat> [[TMP0]], <vscale x 8 x bfloat> [[TMP1]], <vscale x 8 x bfloat> [[ZM:%.*]], i32 7)
// CPP-CHECK-NEXT: ret void
//
void test_svmla_lane2_bf16(uint32_t slice_base, svbfloat16x2_t zn, svbfloat16_t zm) __arm_streaming __arm_shared_za
{
- SVE_ACLE_FUNC(svmla_lane,,_za32,_bf16,_vg2x2)(slice_base + 6, zn, zm, 7);
+ SVE_ACLE_FUNC(svmla_lane,,_za32,_bf16,_vg2x2)(slice_base, zn, zm, 7);
}
// CHECK-LABEL: @test_svmla_lane2_u16(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[ADD:%.*]] = add i32 [[SLICE_BASE:%.*]], 6
// CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv16i16(<vscale x 16 x i16> [[ZN:%.*]], i64 0)
// CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv16i16(<vscale x 16 x i16> [[ZN]], i64 8)
-// CHECK-NEXT: tail call void @llvm.aarch64.sme.umlal.lane.vg2x2.nxv8i16(i32 [[ADD]], <vscale x 8 x i16> [[TMP0]], <vscale x 8 x i16> [[TMP1]], <vscale x 8 x i16> [[ZM:%.*]], i32 7)
+// CHECK-NEXT: tail call void @llvm.aarch64.sme.umlal.lane.vg2x2.nxv8i16(i32 [[SLICE_BASE:%.*]], <vscale x 8 x i16> [[TMP0]], <vscale x 8 x i16> [[TMP1]], <vscale x 8 x i16> [[ZM:%.*]], i32 7)
// CHECK-NEXT: ret void
//
// CPP-CHECK-LABEL: @_Z20test_svmla_lane2_u16j12svuint16x2_tu12__SVUint16_t(
// CPP-CHECK-NEXT: entry:
-// CPP-CHECK-NEXT: [[ADD:%.*]] = add i32 [[SLICE_BASE:%.*]], 6
// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv16i16(<vscale x 16 x i16> [[ZN:%.*]], i64 0)
// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv16i16(<vscale x 16 x i16> [[ZN]], i64 8)
-// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.umlal.lane.vg2x2.nxv8i16(i32 [[ADD]], <vscale x 8 x i16> [[TMP0]], <vscale x 8 x i16> [[TMP1]], <vscale x 8 x i16> [[ZM:%.*]], i32 7)
+// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.umlal.lane.vg2x2.nxv8i16(i32 [[SLICE_BASE:%.*]], <vscale x 8 x i16> [[TMP0]], <vscale x 8 x i16> [[TMP1]], <vscale x 8 x i16> [[ZM:%.*]], i32 7)
// CPP-CHECK-NEXT: ret void
//
void test_svmla_lane2_u16(uint32_t slice_base, svuint16x2_t zn, svuint16_t zm) __arm_streaming __arm_shared_za
{
- SVE_ACLE_FUNC(svmla_lane,,_za32,_u16,_vg2x2)(slice_base + 6, zn, zm, 7);
+ SVE_ACLE_FUNC(svmla_lane,,_za32,_u16,_vg2x2)(slice_base, zn, zm, 7);
}
// CHECK-LABEL: @test_svmla_lane2_s16(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[ADD:%.*]] = add i32 [[SLICE_BASE:%.*]], 6
// CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv16i16(<vscale x 16 x i16> [[ZN:%.*]], i64 0)
// CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv16i16(<vscale x 16 x i16> [[ZN]], i64 8)
-// CHECK-NEXT: tail call void @llvm.aarch64.sme.smlal.lane.vg2x2.nxv8i16(i32 [[ADD]], <vscale x 8 x i16> [[TMP0]], <vscale x 8 x i16> [[TMP1]], <vscale x 8 x i16> [[ZM:%.*]], i32 7)
+// CHECK-NEXT: tail call void @llvm.aarch64.sme.smlal.lane.vg2x2.nxv8i16(i32 [[SLICE_BASE:%.*]], <vscale x 8 x i16> [[TMP0]], <vscale x 8 x i16> [[TMP1]], <vscale x 8 x i16> [[ZM:%.*]], i32 7)
// CHECK-NEXT: ret void
//
// CPP-CHECK-LABEL: @_Z20test_svmla_lane2_s16j11svint16x2_tu11__SVInt16_t(
// CPP-CHECK-NEXT: entry:
-// CPP-CHECK-NEXT: [[ADD:%.*]] = add i32 [[SLICE_BASE:%.*]], 6
// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv16i16(<vscale x 16 x i16> [[ZN:%.*]], i64 0)
// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv16i16(<vscale x 16 x i16> [[ZN]], i64 8)
-// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.smlal.lane.vg2x2.nxv8i16(i32 [[ADD]], <vscale x 8 x i16> [[TMP0]], <vscale x 8 x i16> [[TMP1]], <vscale x 8 x i16> [[ZM:%.*]], i32 7)
+// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.smlal.lane.vg2x2.nxv8i16(i32 [[SLICE_BASE:%.*]], <vscale x 8 x i16> [[TMP0]], <vscale x 8 x i16> [[TMP1]], <vscale x 8 x i16> [[ZM:%.*]], i32 7)
// CPP-CHECK-NEXT: ret void
//
void test_svmla_lane2_s16(uint32_t slice_base, svint16x2_t zn, svint16_t zm) __arm_streaming __arm_shared_za
{
- SVE_ACLE_FUNC(svmla_lane,,_za32,_s16,_vg2x2)(slice_base + 6, zn, zm, 7);
+ SVE_ACLE_FUNC(svmla_lane,,_za32,_s16,_vg2x2)(slice_base, zn, zm, 7);
}
// CHECK-LABEL: @test_svmla_lane4_f16(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[ADD:%.*]] = add i32 [[SLICE_BASE:%.*]], 6
// CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 8 x half> @llvm.vector.extract.nxv8f16.nxv32f16(<vscale x 32 x half> [[ZN:%.*]], i64 0)
// CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 8 x half> @llvm.vector.extract.nxv8f16.nxv32f16(<vscale x 32 x half> [[ZN]], i64 8)
// CHECK-NEXT: [[TMP2:%.*]] = tail call <vscale x 8 x half> @llvm.vector.extract.nxv8f16.nxv32f16(<vscale x 32 x half> [[ZN]], i64 16)
// CHECK-NEXT: [[TMP3:%.*]] = tail call <vscale x 8 x half> @llvm.vector.extract.nxv8f16.nxv32f16(<vscale x 32 x half> [[ZN]], i64 24)
-// CHECK-NEXT: tail call void @llvm.aarch64.sme.fmlal.lane.vg2x4.nxv8f16(i32 [[ADD]], <vscale x 8 x half> [[TMP0]], <vscale x 8 x half> [[TMP1]], <vscale x 8 x half> [[TMP2]], <vscale x 8 x half> [[TMP3]], <vscale x 8 x half> [[ZM:%.*]], i32 7)
+// CHECK-NEXT: tail call void @llvm.aarch64.sme.fmlal.lane.vg2x4.nxv8f16(i32 [[SLICE_BASE:%.*]], <vscale x 8 x half> [[TMP0]], <vscale x 8 x half> [[TMP1]], <vscale x 8 x half> [[TMP2]], <vscale x 8 x half> [[TMP3]], <vscale x 8 x half> [[ZM:%.*]], i32 7)
// CHECK-NEXT: ret void
//
// CPP-CHECK-LABEL: @_Z20test_svmla_lane4_f16j13svfloat16x4_tu13__SVFloat16_t(
// CPP-CHECK-NEXT: entry:
-// CPP-CHECK-NEXT: [[ADD:%.*]] = add i32 [[SLICE_BASE:%.*]], 6
// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 8 x half> @llvm.vector.extract.nxv8f16.nxv32f16(<vscale x 32 x half> [[ZN:%.*]], i64 0)
// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 8 x half> @llvm.vector.extract.nxv8f16.nxv32f16(<vscale x 32 x half> [[ZN]], i64 8)
// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call <vscale x 8 x half> @llvm.vector.extract.nxv8f16.nxv32f16(<vscale x 32 x half> [[ZN]], i64 16)
// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call <vscale x 8 x half> @llvm.vector.extract.nxv8f16.nxv32f16(<vscale x 32 x half> [[ZN]], i64 24)
-// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.fmlal.lane.vg2x4.nxv8f16(i32 [[ADD]], <vscale x 8 x half> [[TMP0]], <vscale x 8 x half> [[TMP1]], <vscale x 8 x half> [[TMP2]], <vscale x 8 x half> [[TMP3]], <vscale x 8 x half> [[ZM:%.*]], i32 7)
+// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.fmlal.lane.vg2x4.nxv8f16(i32 [[SLICE_BASE:%.*]], <vscale x 8 x half> [[TMP0]], <vscale x 8 x half> [[TMP1]], <vscale x 8 x half> [[TMP2]], <vscale x 8 x half> [[TMP3]], <vscale x 8 x half> [[ZM:%.*]], i32 7)
// CPP-CHECK-NEXT: ret void
//
void test_svmla_lane4_f16(uint32_t slice_base, svfloat16x4_t zn, svfloat16_t zm) __arm_streaming __arm_shared_za
{
- SVE_ACLE_FUNC(svmla_lane,,_za32,_f16,_vg2x4)(slice_base + 6, zn, zm, 7);
+ SVE_ACLE_FUNC(svmla_lane,,_za32,_f16,_vg2x4)(slice_base, zn, zm, 7);
}
// CHECK-LABEL: @test_svmla_lane4_bf16(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[ADD:%.*]] = add i32 [[SLICE_BASE:%.*]], 6
// CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 8 x bfloat> @llvm.vector.extract.nxv8bf16.nxv32bf16(<vscale x 32 x bfloat> [[ZN:%.*]], i64 0)
// CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 8 x bfloat> @llvm.vector.extract.nxv8bf16.nxv32bf16(<vscale x 32 x bfloat> [[ZN]], i64 8)
// CHECK-NEXT: [[TMP2:%.*]] = tail call <vscale x 8 x bfloat> @llvm.vector.extract.nxv8bf16.nxv32bf16(<vscale x 32 x bfloat> [[ZN]], i64 16)
// CHECK-NEXT: [[TMP3:%.*]] = tail call <vscale x 8 x bfloat> @llvm.vector.extract.nxv8bf16.nxv32bf16(<vscale x 32 x bfloat> [[ZN]], i64 24)
-// CHECK-NEXT: tail call void @llvm.aarch64.sme.fmlal.lane.vg2x4.nxv8bf16(i32 [[ADD]], <vscale x 8 x bfloat> [[TMP0]], <vscale x 8 x bfloat> [[TMP1]], <vscale x 8 x bfloat> [[TMP2]], <vscale x 8 x bfloat> [[TMP3]], <vscale x 8 x bfloat> [[ZM:%.*]], i32 7)
+// CHECK-NEXT: tail call void @llvm.aarch64.sme.fmlal.lane.vg2x4.nxv8bf16(i32 [[SLICE_BASE:%.*]], <vscale x 8 x bfloat> [[TMP0]], <vscale x 8 x bfloat> [[TMP1]], <vscale x 8 x bfloat> [[TMP2]], <vscale x 8 x bfloat> [[TMP3]], <vscale x 8 x bfloat> [[ZM:%.*]], i32 7)
// CHECK-NEXT: ret void
//
// CPP-CHECK-LABEL: @_Z21test_svmla_lane4_bf16j14svbfloat16x4_tu14__SVBfloat16_t(
// CPP-CHECK-NEXT: entry:
-// CPP-CHECK-NEXT: [[ADD:%.*]] = add i32 [[SLICE_BASE:%.*]], 6
// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 8 x bfloat> @llvm.vector.extract.nxv8bf16.nxv32bf16(<vscale x 32 x bfloat> [[ZN:%.*]], i64 0)
// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 8 x bfloat> @llvm.vector.extract.nxv8bf16.nxv32bf16(<vscale x 32 x bfloat> [[ZN]], i64 8)
// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call <vscale x 8 x bfloat> @llvm.vector.extract.nxv8bf16.nxv32bf16(<vscale x 32 x bfloat> [[ZN]], i64 16)
// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call <vscale x 8 x bfloat> @llvm.vector.extract.nxv8bf16.nxv32bf16(<vscale x 32 x bfloat> [[ZN]], i64 24)
-// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.fmlal.lane.vg2x4.nxv8bf16(i32 [[ADD]], <vscale x 8 x bfloat> [[TMP0]], <vscale x 8 x bfloat> [[TMP1]], <vscale x 8 x bfloat> [[TMP2]], <vscale x 8 x bfloat> [[TMP3]], <vscale x 8 x bfloat> [[ZM:%.*]], i32 7)
+// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.fmlal.lane.vg2x4.nxv8bf16(i32 [[SLICE_BASE:%.*]], <vscale x 8 x bfloat> [[TMP0]], <vscale x 8 x bfloat> [[TMP1]], <vscale x 8 x bfloat> [[TMP2]], <vscale x 8 x bfloat> [[TMP3]], <vscale x 8 x bfloat> [[ZM:%.*]], i32 7)
// CPP-CHECK-NEXT: ret void
//
void test_svmla_lane4_bf16(uint32_t slice_base, svbfloat16x4_t zn, svbfloat16_t zm) __arm_streaming __arm_shared_za
{
- SVE_ACLE_FUNC(svmla_lane,,_za32,_bf16,_vg2x4)(slice_base + 6, zn, zm, 7);
+ SVE_ACLE_FUNC(svmla_lane,,_za32,_bf16,_vg2x4)(slice_base, zn, zm, 7);
}
// CHECK-LABEL: @test_svmla_lane4_u16(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[ADD:%.*]] = add i32 [[SLICE_BASE:%.*]], 6
// CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv32i16(<vscale x 32 x i16> [[ZN:%.*]], i64 0)
// CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv32i16(<vscale x 32 x i16> [[ZN]], i64 8)
// CHECK-NEXT: [[TMP2:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv32i16(<vscale x 32 x i16> [[ZN]], i64 16)
// CHECK-NEXT: [[TMP3:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv32i16(<vscale x 32 x i16> [[ZN]], i64 24)
-// CHECK-NEXT: tail call void @llvm.aarch64.sme.umlal.lane.vg2x4.nxv8i16(i32 [[ADD]], <vscale x 8 x i16> [[TMP0]], <vscale x 8 x i16> [[TMP1]], <vscale x 8 x i16> [[TMP2]], <vscale x 8 x i16> [[TMP3]], <vscale x 8 x i16> [[ZM:%.*]], i32 7)
+// CHECK-NEXT: tail call void @llvm.aarch64.sme.umlal.lane.vg2x4.nxv8i16(i32 [[SLICE_BASE:%.*]], <vscale x 8 x i16> [[TMP0]], <vscale x 8 x i16> [[TMP1]], <vscale x 8 x i16> [[TMP2]], <vscale x 8 x i16> [[TMP3]], <vscale x 8 x i16> [[ZM:%.*]], i32 7)
// CHECK-NEXT: ret void
//
// CPP-CHECK-LABEL: @_Z20test_svmla_lane4_u16j12svuint16x4_tu12__SVUint16_t(
// CPP-CHECK-NEXT: entry:
-// CPP-CHECK-NEXT: [[ADD:%.*]] = add i32 [[SLICE_BASE:%.*]], 6
// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv32i16(<vscale x 32 x i16> [[ZN:%.*]], i64 0)
// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv32i16(<vscale x 32 x i16> [[ZN]], i64 8)
// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv32i16(<vscale x 32 x i16> [[ZN]], i64 16)
// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv32i16(<vscale x 32 x i16> [[ZN]], i64 24)
-// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.umlal.lane.vg2x4.nxv8i16(i32 [[ADD]], <vscale x 8 x i16> [[TMP0]], <vscale x 8 x i16> [[TMP1]], <vscale x 8 x i16> [[TMP2]], <vscale x 8 x i16> [[TMP3]], <vscale x 8 x i16> [[ZM:%.*]], i32 7)
+// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.umlal.lane.vg2x4.nxv8i16(i32 [[SLICE_BASE:%.*]], <vscale x 8 x i16> [[TMP0]], <vscale x 8 x i16> [[TMP1]], <vscale x 8 x i16> [[TMP2]], <vscale x 8 x i16> [[TMP3]], <vscale x 8 x i16> [[ZM:%.*]], i32 7)
// CPP-CHECK-NEXT: ret void
//
void test_svmla_lane4_u16(uint32_t slice_base, svuint16x4_t zn, svuint16_t zm) __arm_streaming __arm_shared_za
{
- SVE_ACLE_FUNC(svmla_lane,,_za32,_u16,_vg2x4)(slice_base + 6, zn, zm, 7);
+ SVE_ACLE_FUNC(svmla_lane,,_za32,_u16,_vg2x4)(slice_base, zn, zm, 7);
}
// CHECK-LABEL: @test_svmla_lane4_s16(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[ADD:%.*]] = add i32 [[SLICE_BASE:%.*]], 6
// CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv32i16(<vscale x 32 x i16> [[ZN:%.*]], i64 0)
// CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv32i16(<vscale x 32 x i16> [[ZN]], i64 8)
// CHECK-NEXT: [[TMP2:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv32i16(<vscale x 32 x i16> [[ZN]], i64 16)
// CHECK-NEXT: [[TMP3:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv32i16(<vscale x 32 x i16> [[ZN]], i64 24)
-// CHECK-NEXT: tail call void @llvm.aarch64.sme.smlal.lane.vg2x4.nxv8i16(i32 [[ADD]], <vscale x 8 x i16> [[TMP0]], <vscale x 8 x i16> [[TMP1]], <vscale x 8 x i16> [[TMP2]], <vscale x 8 x i16> [[TMP3]], <vscale x 8 x i16> [[ZM:%.*]], i32 7)
+// CHECK-NEXT: tail call void @llvm.aarch64.sme.smlal.lane.vg2x4.nxv8i16(i32 [[SLICE_BASE:%.*]], <vscale x 8 x i16> [[TMP0]], <vscale x 8 x i16> [[TMP1]], <vscale x 8 x i16> [[TMP2]], <vscale x 8 x i16> [[TMP3]], <vscale x 8 x i16> [[ZM:%.*]], i32 7)
// CHECK-NEXT: ret void
//
// CPP-CHECK-LABEL: @_Z20test_svmla_lane4_s16j11svint16x4_tu11__SVInt16_t(
// CPP-CHECK-NEXT: entry:
-// CPP-CHECK-NEXT: [[ADD:%.*]] = add i32 [[SLICE_BASE:%.*]], 6
// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv32i16(<vscale x 32 x i16> [[ZN:%.*]], i64 0)
// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv32i16(<vscale x 32 x i16> [[ZN]], i64 8)
// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv32i16(<vscale x 32 x i16> [[ZN]], i64 16)
// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv32i16(<vscale x 32 x i16> [[ZN]], i64 24)
-// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.smlal.lane.vg2x4.nxv8i16(i32 [[ADD]], <vscale x 8 x i16> [[TMP0]], <vscale x 8 x i16> [[TMP1]], <vscale x 8 x i16> [[TMP2]], <vscale x 8 x i16> [[TMP3]], <vscale x 8 x i16> [[ZM:%.*]], i32 7)
+// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.smlal.lane.vg2x4.nxv8i16(i32 [[SLICE_BASE:%.*]], <vscale x 8 x i16> [[TMP0]], <vscale x 8 x i16> [[TMP1]], <vscale x 8 x i16> [[TMP2]], <vscale x 8 x i16> [[TMP3]], <vscale x 8 x i16> [[ZM:%.*]], i32 7)
// CPP-CHECK-NEXT: ret void
//
void test_svmla_lane4_s16(uint32_t slice_base, svint16x4_t zn, svint16_t zm) __arm_streaming __arm_shared_za
{
- SVE_ACLE_FUNC(svmla_lane,,_za32,_s16,_vg2x4)(slice_base + 6, zn, zm, 7);
+ SVE_ACLE_FUNC(svmla_lane,,_za32,_s16,_vg2x4)(slice_base, zn, zm, 7);
}
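The lane forms in the hunk above multiply every element of zn by one indexed element of zm and accumulate the widened products into 32-bit ZA; the lane index must be an integer constant expression (the tests use 7, the top of the range for 16-bit elements). An illustrative sketch, not part of the patch, under the same header/target assumptions as above:

#include <arm_sme.h>

// Lane form: zn[i] * zm[7] accumulated into 32-bit ZA at slice_base.
// The last argument is an immediate and is range-checked by Sema.
void mla_lane(uint32_t slice_base, svfloat16_t zn, svfloat16_t zm)
    __arm_streaming __arm_shared_za {
  svmla_lane_za32_f16_vg2x1(slice_base, zn, zm, 7);
}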
diff --git a/clang/test/CodeGen/aarch64-sme2-intrinsics/acle_sme2_mlall.c b/clang/test/CodeGen/aarch64-sme2-intrinsics/acle_sme2_mlall.c
index 7d00ddafb84c5b..44e715e9581b7c 100644
--- a/clang/test/CodeGen/aarch64-sme2-intrinsics/acle_sme2_mlall.c
+++ b/clang/test/CodeGen/aarch64-sme2-intrinsics/acle_sme2_mlall.c
@@ -23,178 +23,158 @@
// CHECK-LABEL: @test_svmla_single_x1_s8(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[ADD:%.*]] = add i32 [[SLICE_BASE:%.*]], 12
-// CHECK-NEXT: tail call void @llvm.aarch64.sme.smla.za32.single.vg4x1.nxv16i8(i32 [[ADD]], <vscale x 16 x i8> [[ZN:%.*]], <vscale x 16 x i8> [[ZM:%.*]])
+// CHECK-NEXT: tail call void @llvm.aarch64.sme.smla.za32.single.vg4x1.nxv16i8(i32 [[SLICE_BASE:%.*]], <vscale x 16 x i8> [[ZN:%.*]], <vscale x 16 x i8> [[ZM:%.*]])
// CHECK-NEXT: ret void
//
// CPP-CHECK-LABEL: @_Z23test_svmla_single_x1_s8ju10__SVInt8_tS_(
// CPP-CHECK-NEXT: entry:
-// CPP-CHECK-NEXT: [[ADD:%.*]] = add i32 [[SLICE_BASE:%.*]], 12
-// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.smla.za32.single.vg4x1.nxv16i8(i32 [[ADD]], <vscale x 16 x i8> [[ZN:%.*]], <vscale x 16 x i8> [[ZM:%.*]])
+// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.smla.za32.single.vg4x1.nxv16i8(i32 [[SLICE_BASE:%.*]], <vscale x 16 x i8> [[ZN:%.*]], <vscale x 16 x i8> [[ZM:%.*]])
// CPP-CHECK-NEXT: ret void
//
void test_svmla_single_x1_s8(uint32_t slice_base, svint8_t zn, svint8_t zm) __arm_streaming __arm_shared_za
{
- SVE_ACLE_FUNC(svmla_za32,_s8,_vg4x1,,)(slice_base + 12, zn, zm);
+ SVE_ACLE_FUNC(svmla_za32,_s8,_vg4x1,,)(slice_base, zn, zm);
}
// CHECK-LABEL: @test_svmla_single_x1_s16(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[ADD:%.*]] = add i32 [[SLICE_BASE:%.*]], 12
-// CHECK-NEXT: tail call void @llvm.aarch64.sme.smla.za64.single.vg4x1.nxv8i16(i32 [[ADD]], <vscale x 8 x i16> [[ZN:%.*]], <vscale x 8 x i16> [[ZM:%.*]])
+// CHECK-NEXT: tail call void @llvm.aarch64.sme.smla.za64.single.vg4x1.nxv8i16(i32 [[SLICE_BASE:%.*]], <vscale x 8 x i16> [[ZN:%.*]], <vscale x 8 x i16> [[ZM:%.*]])
// CHECK-NEXT: ret void
//
// CPP-CHECK-LABEL: @_Z24test_svmla_single_x1_s16ju11__SVInt16_tS_(
// CPP-CHECK-NEXT: entry:
-// CPP-CHECK-NEXT: [[ADD:%.*]] = add i32 [[SLICE_BASE:%.*]], 12
-// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.smla.za64.single.vg4x1.nxv8i16(i32 [[ADD]], <vscale x 8 x i16> [[ZN:%.*]], <vscale x 8 x i16> [[ZM:%.*]])
+// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.smla.za64.single.vg4x1.nxv8i16(i32 [[SLICE_BASE:%.*]], <vscale x 8 x i16> [[ZN:%.*]], <vscale x 8 x i16> [[ZM:%.*]])
// CPP-CHECK-NEXT: ret void
//
void test_svmla_single_x1_s16(uint32_t slice_base, svint16_t zn, svint16_t zm) __arm_streaming __arm_shared_za
{
- SVE_ACLE_FUNC(svmla_za64,_s16,_vg4x1,,)(slice_base + 12, zn, zm);
+ SVE_ACLE_FUNC(svmla_za64,_s16,_vg4x1,,)(slice_base, zn, zm);
}
// CHECK-LABEL: @test_uvmlal_single_x1_u8(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[ADD:%.*]] = add i32 [[SLICE_BASE:%.*]], 12
-// CHECK-NEXT: tail call void @llvm.aarch64.sme.umla.za32.single.vg4x1.nxv16i8(i32 [[ADD]], <vscale x 16 x i8> [[ZN:%.*]], <vscale x 16 x i8> [[ZM:%.*]])
+// CHECK-NEXT: tail call void @llvm.aarch64.sme.umla.za32.single.vg4x1.nxv16i8(i32 [[SLICE_BASE:%.*]], <vscale x 16 x i8> [[ZN:%.*]], <vscale x 16 x i8> [[ZM:%.*]])
// CHECK-NEXT: ret void
//
// CPP-CHECK-LABEL: @_Z24test_uvmlal_single_x1_u8ju11__SVUint8_tS_(
// CPP-CHECK-NEXT: entry:
-// CPP-CHECK-NEXT: [[ADD:%.*]] = add i32 [[SLICE_BASE:%.*]], 12
-// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.umla.za32.single.vg4x1.nxv16i8(i32 [[ADD]], <vscale x 16 x i8> [[ZN:%.*]], <vscale x 16 x i8> [[ZM:%.*]])
+// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.umla.za32.single.vg4x1.nxv16i8(i32 [[SLICE_BASE:%.*]], <vscale x 16 x i8> [[ZN:%.*]], <vscale x 16 x i8> [[ZM:%.*]])
// CPP-CHECK-NEXT: ret void
//
void test_uvmlal_single_x1_u8(uint32_t slice_base, svuint8_t zn, svuint8_t zm) __arm_streaming __arm_shared_za
{
- SVE_ACLE_FUNC(svmla_za32,_u8,_vg4x1,,)(slice_base + 12, zn, zm);
+ SVE_ACLE_FUNC(svmla_za32,_u8,_vg4x1,,)(slice_base, zn, zm);
}
// CHECK-LABEL: @test_uvmlal_single_x1_u16(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[ADD:%.*]] = add i32 [[SLICE_BASE:%.*]], 12
-// CHECK-NEXT: tail call void @llvm.aarch64.sme.umla.za64.single.vg4x1.nxv8i16(i32 [[ADD]], <vscale x 8 x i16> [[ZN:%.*]], <vscale x 8 x i16> [[ZM:%.*]])
+// CHECK-NEXT: tail call void @llvm.aarch64.sme.umla.za64.single.vg4x1.nxv8i16(i32 [[SLICE_BASE:%.*]], <vscale x 8 x i16> [[ZN:%.*]], <vscale x 8 x i16> [[ZM:%.*]])
// CHECK-NEXT: ret void
//
// CPP-CHECK-LABEL: @_Z25test_uvmlal_single_x1_u16ju12__SVUint16_tS_(
// CPP-CHECK-NEXT: entry:
-// CPP-CHECK-NEXT: [[ADD:%.*]] = add i32 [[SLICE_BASE:%.*]], 12
-// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.umla.za64.single.vg4x1.nxv8i16(i32 [[ADD]], <vscale x 8 x i16> [[ZN:%.*]], <vscale x 8 x i16> [[ZM:%.*]])
+// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.umla.za64.single.vg4x1.nxv8i16(i32 [[SLICE_BASE:%.*]], <vscale x 8 x i16> [[ZN:%.*]], <vscale x 8 x i16> [[ZM:%.*]])
// CPP-CHECK-NEXT: ret void
//
void test_uvmlal_single_x1_u16(uint32_t slice_base, svuint16_t zn, svuint16_t zm) __arm_streaming __arm_shared_za
{
- SVE_ACLE_FUNC(svmla_za64,_u16,_vg4x1,,)(slice_base + 12, zn, zm);
+ SVE_ACLE_FUNC(svmla_za64,_u16,_vg4x1,,)(slice_base, zn, zm);
}
// MLSL
// CHECK-LABEL: @test_svmls_single_x1_s8(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[ADD:%.*]] = add i32 [[SLICE_BASE:%.*]], 12
-// CHECK-NEXT: tail call void @llvm.aarch64.sme.smls.za32.single.vg4x1.nxv16i8(i32 [[ADD]], <vscale x 16 x i8> [[ZN:%.*]], <vscale x 16 x i8> [[ZM:%.*]])
+// CHECK-NEXT: tail call void @llvm.aarch64.sme.smls.za32.single.vg4x1.nxv16i8(i32 [[SLICE_BASE:%.*]], <vscale x 16 x i8> [[ZN:%.*]], <vscale x 16 x i8> [[ZM:%.*]])
// CHECK-NEXT: ret void
//
// CPP-CHECK-LABEL: @_Z23test_svmls_single_x1_s8ju10__SVInt8_tS_(
// CPP-CHECK-NEXT: entry:
-// CPP-CHECK-NEXT: [[ADD:%.*]] = add i32 [[SLICE_BASE:%.*]], 12
-// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.smls.za32.single.vg4x1.nxv16i8(i32 [[ADD]], <vscale x 16 x i8> [[ZN:%.*]], <vscale x 16 x i8> [[ZM:%.*]])
+// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.smls.za32.single.vg4x1.nxv16i8(i32 [[SLICE_BASE:%.*]], <vscale x 16 x i8> [[ZN:%.*]], <vscale x 16 x i8> [[ZM:%.*]])
// CPP-CHECK-NEXT: ret void
//
void test_svmls_single_x1_s8(uint32_t slice_base, svint8_t zn, svint8_t zm) __arm_streaming __arm_shared_za
{
- SVE_ACLE_FUNC(svmls_za32,_s8,_vg4x1,,)(slice_base + 12, zn, zm);
+ SVE_ACLE_FUNC(svmls_za32,_s8,_vg4x1,,)(slice_base, zn, zm);
}
// CHECK-LABEL: @test_svmls_single_x1_s16(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[ADD:%.*]] = add i32 [[SLICE_BASE:%.*]], 12
-// CHECK-NEXT: tail call void @llvm.aarch64.sme.smls.za64.single.vg4x1.nxv8i16(i32 [[ADD]], <vscale x 8 x i16> [[ZN:%.*]], <vscale x 8 x i16> [[ZM:%.*]])
+// CHECK-NEXT: tail call void @llvm.aarch64.sme.smls.za64.single.vg4x1.nxv8i16(i32 [[SLICE_BASE:%.*]], <vscale x 8 x i16> [[ZN:%.*]], <vscale x 8 x i16> [[ZM:%.*]])
// CHECK-NEXT: ret void
//
// CPP-CHECK-LABEL: @_Z24test_svmls_single_x1_s16ju11__SVInt16_tS_(
// CPP-CHECK-NEXT: entry:
-// CPP-CHECK-NEXT: [[ADD:%.*]] = add i32 [[SLICE_BASE:%.*]], 12
-// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.smls.za64.single.vg4x1.nxv8i16(i32 [[ADD]], <vscale x 8 x i16> [[ZN:%.*]], <vscale x 8 x i16> [[ZM:%.*]])
+// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.smls.za64.single.vg4x1.nxv8i16(i32 [[SLICE_BASE:%.*]], <vscale x 8 x i16> [[ZN:%.*]], <vscale x 8 x i16> [[ZM:%.*]])
// CPP-CHECK-NEXT: ret void
//
void test_svmls_single_x1_s16(uint32_t slice_base, svint16_t zn, svint16_t zm) __arm_streaming __arm_shared_za
{
- SVE_ACLE_FUNC(svmls_za64,_s16,_vg4x1,,)(slice_base + 12, zn, zm);
+ SVE_ACLE_FUNC(svmls_za64,_s16,_vg4x1,,)(slice_base, zn, zm);
}
// CHECK-LABEL: @test_uvmlsl_single_x1_u8(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[ADD:%.*]] = add i32 [[SLICE_BASE:%.*]], 12
-// CHECK-NEXT: tail call void @llvm.aarch64.sme.umls.za32.single.vg4x1.nxv16i8(i32 [[ADD]], <vscale x 16 x i8> [[ZN:%.*]], <vscale x 16 x i8> [[ZM:%.*]])
+// CHECK-NEXT: tail call void @llvm.aarch64.sme.umls.za32.single.vg4x1.nxv16i8(i32 [[SLICE_BASE:%.*]], <vscale x 16 x i8> [[ZN:%.*]], <vscale x 16 x i8> [[ZM:%.*]])
// CHECK-NEXT: ret void
//
// CPP-CHECK-LABEL: @_Z24test_uvmlsl_single_x1_u8ju11__SVUint8_tS_(
// CPP-CHECK-NEXT: entry:
-// CPP-CHECK-NEXT: [[ADD:%.*]] = add i32 [[SLICE_BASE:%.*]], 12
-// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.umls.za32.single.vg4x1.nxv16i8(i32 [[ADD]], <vscale x 16 x i8> [[ZN:%.*]], <vscale x 16 x i8> [[ZM:%.*]])
+// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.umls.za32.single.vg4x1.nxv16i8(i32 [[SLICE_BASE:%.*]], <vscale x 16 x i8> [[ZN:%.*]], <vscale x 16 x i8> [[ZM:%.*]])
// CPP-CHECK-NEXT: ret void
//
void test_uvmlsl_single_x1_u8(uint32_t slice_base, svuint8_t zn, svuint8_t zm) __arm_streaming __arm_shared_za
{
- SVE_ACLE_FUNC(svmls_za32,_u8,_vg4x1,,)(slice_base + 12, zn, zm);
+ SVE_ACLE_FUNC(svmls_za32,_u8,_vg4x1,,)(slice_base, zn, zm);
}
// CHECK-LABEL: @test_uvmlsl_single_x1_u16(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[ADD:%.*]] = add i32 [[SLICE_BASE:%.*]], 12
-// CHECK-NEXT: tail call void @llvm.aarch64.sme.umls.za64.single.vg4x1.nxv8i16(i32 [[ADD]], <vscale x 8 x i16> [[ZN:%.*]], <vscale x 8 x i16> [[ZM:%.*]])
+// CHECK-NEXT: tail call void @llvm.aarch64.sme.umls.za64.single.vg4x1.nxv8i16(i32 [[SLICE_BASE:%.*]], <vscale x 8 x i16> [[ZN:%.*]], <vscale x 8 x i16> [[ZM:%.*]])
// CHECK-NEXT: ret void
//
// CPP-CHECK-LABEL: @_Z25test_uvmlsl_single_x1_u16ju12__SVUint16_tS_(
// CPP-CHECK-NEXT: entry:
-// CPP-CHECK-NEXT: [[ADD:%.*]] = add i32 [[SLICE_BASE:%.*]], 12
-// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.umls.za64.single.vg4x1.nxv8i16(i32 [[ADD]], <vscale x 8 x i16> [[ZN:%.*]], <vscale x 8 x i16> [[ZM:%.*]])
+// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.umls.za64.single.vg4x1.nxv8i16(i32 [[SLICE_BASE:%.*]], <vscale x 8 x i16> [[ZN:%.*]], <vscale x 8 x i16> [[ZM:%.*]])
// CPP-CHECK-NEXT: ret void
//
void test_uvmlsl_single_x1_u16(uint32_t slice_base, svuint16_t zn, svuint16_t zm) __arm_streaming __arm_shared_za
{
- SVE_ACLE_FUNC(svmls_za64,_u16,_vg4x1,,)(slice_base + 12, zn, zm);
+ SVE_ACLE_FUNC(svmls_za64,_u16,_vg4x1,,)(slice_base, zn, zm);
}
// SUMLALL
// CHECK-LABEL: @test_sumlall_single_x1_s8(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[ADD:%.*]] = add i32 [[SLICE_BASE:%.*]], 12
-// CHECK-NEXT: tail call void @llvm.aarch64.sme.usmla.za32.single.vg4x1.nxv16i8(i32 [[ADD]], <vscale x 16 x i8> [[ZM:%.*]], <vscale x 16 x i8> [[ZN:%.*]])
+// CHECK-NEXT: tail call void @llvm.aarch64.sme.usmla.za32.single.vg4x1.nxv16i8(i32 [[SLICE_BASE:%.*]], <vscale x 16 x i8> [[ZM:%.*]], <vscale x 16 x i8> [[ZN:%.*]])
// CHECK-NEXT: ret void
//
// CPP-CHECK-LABEL: @_Z25test_sumlall_single_x1_s8ju10__SVInt8_tu11__SVUint8_t(
// CPP-CHECK-NEXT: entry:
-// CPP-CHECK-NEXT: [[ADD:%.*]] = add i32 [[SLICE_BASE:%.*]], 12
-// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.usmla.za32.single.vg4x1.nxv16i8(i32 [[ADD]], <vscale x 16 x i8> [[ZM:%.*]], <vscale x 16 x i8> [[ZN:%.*]])
+// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.usmla.za32.single.vg4x1.nxv16i8(i32 [[SLICE_BASE:%.*]], <vscale x 16 x i8> [[ZM:%.*]], <vscale x 16 x i8> [[ZN:%.*]])
// CPP-CHECK-NEXT: ret void
//
void test_sumlall_single_x1_s8(uint32_t slice_base, svint8_t zn, svuint8_t zm) __arm_streaming __arm_shared_za
{
- SVE_ACLE_FUNC(svsumla_za32,_s8,_vg4x1,,)(slice_base + 12, zn, zm);
+ SVE_ACLE_FUNC(svsumla_za32,_s8,_vg4x1,,)(slice_base, zn, zm);
}
// USMLALL
// CHECK-LABEL: @test_usmlall_single_x1_u8(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[ADD:%.*]] = add i32 [[SLICE_BASE:%.*]], 12
-// CHECK-NEXT: tail call void @llvm.aarch64.sme.usmla.za32.single.vg4x1.nxv16i8(i32 [[ADD]], <vscale x 16 x i8> [[ZN:%.*]], <vscale x 16 x i8> [[ZM:%.*]])
+// CHECK-NEXT: tail call void @llvm.aarch64.sme.usmla.za32.single.vg4x1.nxv16i8(i32 [[SLICE_BASE:%.*]], <vscale x 16 x i8> [[ZN:%.*]], <vscale x 16 x i8> [[ZM:%.*]])
// CHECK-NEXT: ret void
//
// CPP-CHECK-LABEL: @_Z25test_usmlall_single_x1_u8ju11__SVUint8_tu10__SVInt8_t(
// CPP-CHECK-NEXT: entry:
-// CPP-CHECK-NEXT: [[ADD:%.*]] = add i32 [[SLICE_BASE:%.*]], 12
-// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.usmla.za32.single.vg4x1.nxv16i8(i32 [[ADD]], <vscale x 16 x i8> [[ZN:%.*]], <vscale x 16 x i8> [[ZM:%.*]])
+// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.usmla.za32.single.vg4x1.nxv16i8(i32 [[SLICE_BASE:%.*]], <vscale x 16 x i8> [[ZN:%.*]], <vscale x 16 x i8> [[ZM:%.*]])
// CPP-CHECK-NEXT: ret void
//
void test_usmlall_single_x1_u8(uint32_t slice_base, svuint8_t zn, svint8_t zm) __arm_streaming __arm_shared_za
{
- SVE_ACLE_FUNC(svusmla_za32,_u8,_vg4x1,,)(slice_base + 12, zn, zm);
+ SVE_ACLE_FUNC(svusmla_za32,_u8,_vg4x1,,)(slice_base, zn, zm);
}
//
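One detail the checks above make visible: both mixed-signedness spellings lower onto the same aarch64.sme.usmla.za32 intrinsic, with svsumla swapping its vector operands during CodeGen so the unsigned operand always comes first. Source-level sketch, illustrative only:

#include <arm_sme.h>

// s8-by-u8 and u8-by-s8 widening multiply-accumulate into 32-bit ZA.
// Both map to usmla; svsumla's zn/zm are swapped when lowering.
void mixed_sign(uint32_t slice_base, svint8_t s, svuint8_t u)
    __arm_streaming __arm_shared_za {
  svsumla_za32_s8_vg4x1(slice_base, s, u);
  svusmla_za32_u8_vg4x1(slice_base, u, s);
}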
@@ -205,218 +185,198 @@ void test_usmlall_single_x1_u8(uint32_t slice_base, svuint8_t zn, svint8_t zm) _
// CHECK-LABEL: @test_svmla_single_x2_s8(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[ADD:%.*]] = add i32 [[SLICE_BASE:%.*]], 4
// CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv32i8(<vscale x 32 x i8> [[ZN:%.*]], i64 0)
// CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv32i8(<vscale x 32 x i8> [[ZN]], i64 16)
-// CHECK-NEXT: tail call void @llvm.aarch64.sme.smla.za32.single.vg4x2.nxv16i8(i32 [[ADD]], <vscale x 16 x i8> [[TMP0]], <vscale x 16 x i8> [[TMP1]], <vscale x 16 x i8> [[ZM:%.*]])
+// CHECK-NEXT: tail call void @llvm.aarch64.sme.smla.za32.single.vg4x2.nxv16i8(i32 [[SLICE_BASE:%.*]], <vscale x 16 x i8> [[TMP0]], <vscale x 16 x i8> [[TMP1]], <vscale x 16 x i8> [[ZM:%.*]])
// CHECK-NEXT: ret void
//
// CPP-CHECK-LABEL: @_Z23test_svmla_single_x2_s8j10svint8x2_tu10__SVInt8_t(
// CPP-CHECK-NEXT: entry:
-// CPP-CHECK-NEXT: [[ADD:%.*]] = add i32 [[SLICE_BASE:%.*]], 4
// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv32i8(<vscale x 32 x i8> [[ZN:%.*]], i64 0)
// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv32i8(<vscale x 32 x i8> [[ZN]], i64 16)
-// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.smla.za32.single.vg4x2.nxv16i8(i32 [[ADD]], <vscale x 16 x i8> [[TMP0]], <vscale x 16 x i8> [[TMP1]], <vscale x 16 x i8> [[ZM:%.*]])
+// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.smla.za32.single.vg4x2.nxv16i8(i32 [[SLICE_BASE:%.*]], <vscale x 16 x i8> [[TMP0]], <vscale x 16 x i8> [[TMP1]], <vscale x 16 x i8> [[ZM:%.*]])
// CPP-CHECK-NEXT: ret void
//
void test_svmla_single_x2_s8(uint32_t slice_base, svint8x2_t zn, svint8_t zm) __arm_streaming __arm_shared_za
{
- SVE_ACLE_FUNC(svmla,_single,_za32,_s8,_vg4x2)(slice_base + 4, zn, zm);
+ SVE_ACLE_FUNC(svmla,_single,_za32,_s8,_vg4x2)(slice_base, zn, zm);
}
// CHECK-LABEL: @test_svmla_single_x2_s16(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[ADD:%.*]] = add i32 [[SLICE_BASE:%.*]], 4
// CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv16i16(<vscale x 16 x i16> [[ZN:%.*]], i64 0)
// CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv16i16(<vscale x 16 x i16> [[ZN]], i64 8)
-// CHECK-NEXT: tail call void @llvm.aarch64.sme.smla.za64.single.vg4x2.nxv8i16(i32 [[ADD]], <vscale x 8 x i16> [[TMP0]], <vscale x 8 x i16> [[TMP1]], <vscale x 8 x i16> [[ZM:%.*]])
+// CHECK-NEXT: tail call void @llvm.aarch64.sme.smla.za64.single.vg4x2.nxv8i16(i32 [[SLICE_BASE:%.*]], <vscale x 8 x i16> [[TMP0]], <vscale x 8 x i16> [[TMP1]], <vscale x 8 x i16> [[ZM:%.*]])
// CHECK-NEXT: ret void
//
// CPP-CHECK-LABEL: @_Z24test_svmla_single_x2_s16j11svint16x2_tu11__SVInt16_t(
// CPP-CHECK-NEXT: entry:
-// CPP-CHECK-NEXT: [[ADD:%.*]] = add i32 [[SLICE_BASE:%.*]], 4
// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv16i16(<vscale x 16 x i16> [[ZN:%.*]], i64 0)
// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv16i16(<vscale x 16 x i16> [[ZN]], i64 8)
-// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.smla.za64.single.vg4x2.nxv8i16(i32 [[ADD]], <vscale x 8 x i16> [[TMP0]], <vscale x 8 x i16> [[TMP1]], <vscale x 8 x i16> [[ZM:%.*]])
+// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.smla.za64.single.vg4x2.nxv8i16(i32 [[SLICE_BASE:%.*]], <vscale x 8 x i16> [[TMP0]], <vscale x 8 x i16> [[TMP1]], <vscale x 8 x i16> [[ZM:%.*]])
// CPP-CHECK-NEXT: ret void
//
void test_svmla_single_x2_s16(uint32_t slice_base, svint16x2_t zn, svint16_t zm) __arm_streaming __arm_shared_za
{
- SVE_ACLE_FUNC(svmla,_single,_za64,_s16,_vg4x2)(slice_base + 4, zn, zm);
+ SVE_ACLE_FUNC(svmla,_single,_za64,_s16,_vg4x2)(slice_base, zn, zm);
}
// CHECK-LABEL: @test_svmla_single_x2_u8(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[ADD:%.*]] = add i32 [[SLICE_BASE:%.*]], 4
// CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv32i8(<vscale x 32 x i8> [[ZN:%.*]], i64 0)
// CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv32i8(<vscale x 32 x i8> [[ZN]], i64 16)
-// CHECK-NEXT: tail call void @llvm.aarch64.sme.umla.za32.single.vg4x2.nxv16i8(i32 [[ADD]], <vscale x 16 x i8> [[TMP0]], <vscale x 16 x i8> [[TMP1]], <vscale x 16 x i8> [[ZM:%.*]])
+// CHECK-NEXT: tail call void @llvm.aarch64.sme.umla.za32.single.vg4x2.nxv16i8(i32 [[SLICE_BASE:%.*]], <vscale x 16 x i8> [[TMP0]], <vscale x 16 x i8> [[TMP1]], <vscale x 16 x i8> [[ZM:%.*]])
// CHECK-NEXT: ret void
//
// CPP-CHECK-LABEL: @_Z23test_svmla_single_x2_u8j11svuint8x2_tu11__SVUint8_t(
// CPP-CHECK-NEXT: entry:
-// CPP-CHECK-NEXT: [[ADD:%.*]] = add i32 [[SLICE_BASE:%.*]], 4
// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv32i8(<vscale x 32 x i8> [[ZN:%.*]], i64 0)
// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv32i8(<vscale x 32 x i8> [[ZN]], i64 16)
-// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.umla.za32.single.vg4x2.nxv16i8(i32 [[ADD]], <vscale x 16 x i8> [[TMP0]], <vscale x 16 x i8> [[TMP1]], <vscale x 16 x i8> [[ZM:%.*]])
+// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.umla.za32.single.vg4x2.nxv16i8(i32 [[SLICE_BASE:%.*]], <vscale x 16 x i8> [[TMP0]], <vscale x 16 x i8> [[TMP1]], <vscale x 16 x i8> [[ZM:%.*]])
// CPP-CHECK-NEXT: ret void
//
void test_svmla_single_x2_u8(uint32_t slice_base, svuint8x2_t zn, svuint8_t zm) __arm_streaming __arm_shared_za
{
- SVE_ACLE_FUNC(svmla,_single,_za32,_u8,_vg4x2)(slice_base + 4, zn, zm);
+ SVE_ACLE_FUNC(svmla,_single,_za32,_u8,_vg4x2)(slice_base, zn, zm);
}
// CHECK-LABEL: @test_svmla_single_x2_u16(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[ADD:%.*]] = add i32 [[SLICE_BASE:%.*]], 4
// CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv16i16(<vscale x 16 x i16> [[ZN:%.*]], i64 0)
// CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv16i16(<vscale x 16 x i16> [[ZN]], i64 8)
-// CHECK-NEXT: tail call void @llvm.aarch64.sme.umla.za64.single.vg4x2.nxv8i16(i32 [[ADD]], <vscale x 8 x i16> [[TMP0]], <vscale x 8 x i16> [[TMP1]], <vscale x 8 x i16> [[ZM:%.*]])
+// CHECK-NEXT: tail call void @llvm.aarch64.sme.umla.za64.single.vg4x2.nxv8i16(i32 [[SLICE_BASE:%.*]], <vscale x 8 x i16> [[TMP0]], <vscale x 8 x i16> [[TMP1]], <vscale x 8 x i16> [[ZM:%.*]])
// CHECK-NEXT: ret void
//
// CPP-CHECK-LABEL: @_Z24test_svmla_single_x2_u16j12svuint16x2_tu12__SVUint16_t(
// CPP-CHECK-NEXT: entry:
-// CPP-CHECK-NEXT: [[ADD:%.*]] = add i32 [[SLICE_BASE:%.*]], 4
// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv16i16(<vscale x 16 x i16> [[ZN:%.*]], i64 0)
// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv16i16(<vscale x 16 x i16> [[ZN]], i64 8)
-// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.umla.za64.single.vg4x2.nxv8i16(i32 [[ADD]], <vscale x 8 x i16> [[TMP0]], <vscale x 8 x i16> [[TMP1]], <vscale x 8 x i16> [[ZM:%.*]])
+// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.umla.za64.single.vg4x2.nxv8i16(i32 [[SLICE_BASE:%.*]], <vscale x 8 x i16> [[TMP0]], <vscale x 8 x i16> [[TMP1]], <vscale x 8 x i16> [[ZM:%.*]])
// CPP-CHECK-NEXT: ret void
//
void test_svmla_single_x2_u16(uint32_t slice_base, svuint16x2_t zn, svuint16_t zm) __arm_streaming __arm_shared_za
{
- SVE_ACLE_FUNC(svmla,_single,_za64,_u16,_vg4x2)(slice_base + 4, zn, zm);
+ SVE_ACLE_FUNC(svmla,_single,_za64,_u16,_vg4x2)(slice_base, zn, zm);
}
// MLSL
// CHECK-LABEL: @test_svmls_single_x2_s8(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[ADD:%.*]] = add i32 [[SLICE_BASE:%.*]], 4
// CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv32i8(<vscale x 32 x i8> [[ZN:%.*]], i64 0)
// CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv32i8(<vscale x 32 x i8> [[ZN]], i64 16)
-// CHECK-NEXT: tail call void @llvm.aarch64.sme.smls.za32.single.vg4x2.nxv16i8(i32 [[ADD]], <vscale x 16 x i8> [[TMP0]], <vscale x 16 x i8> [[TMP1]], <vscale x 16 x i8> [[ZM:%.*]])
+// CHECK-NEXT: tail call void @llvm.aarch64.sme.smls.za32.single.vg4x2.nxv16i8(i32 [[SLICE_BASE:%.*]], <vscale x 16 x i8> [[TMP0]], <vscale x 16 x i8> [[TMP1]], <vscale x 16 x i8> [[ZM:%.*]])
// CHECK-NEXT: ret void
//
// CPP-CHECK-LABEL: @_Z23test_svmls_single_x2_s8j10svint8x2_tu10__SVInt8_t(
// CPP-CHECK-NEXT: entry:
-// CPP-CHECK-NEXT: [[ADD:%.*]] = add i32 [[SLICE_BASE:%.*]], 4
// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv32i8(<vscale x 32 x i8> [[ZN:%.*]], i64 0)
// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv32i8(<vscale x 32 x i8> [[ZN]], i64 16)
-// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.smls.za32.single.vg4x2.nxv16i8(i32 [[ADD]], <vscale x 16 x i8> [[TMP0]], <vscale x 16 x i8> [[TMP1]], <vscale x 16 x i8> [[ZM:%.*]])
+// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.smls.za32.single.vg4x2.nxv16i8(i32 [[SLICE_BASE:%.*]], <vscale x 16 x i8> [[TMP0]], <vscale x 16 x i8> [[TMP1]], <vscale x 16 x i8> [[ZM:%.*]])
// CPP-CHECK-NEXT: ret void
//
void test_svmls_single_x2_s8(uint32_t slice_base, svint8x2_t zn, svint8_t zm) __arm_streaming __arm_shared_za
{
- SVE_ACLE_FUNC(svmls,_single,_za32,_s8,_vg4x2)(slice_base + 4, zn, zm);
+ SVE_ACLE_FUNC(svmls,_single,_za32,_s8,_vg4x2)(slice_base, zn, zm);
}
// CHECK-LABEL: @test_svmls_single_x2_s16(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[ADD:%.*]] = add i32 [[SLICE_BASE:%.*]], 4
// CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv16i16(<vscale x 16 x i16> [[ZN:%.*]], i64 0)
// CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv16i16(<vscale x 16 x i16> [[ZN]], i64 8)
-// CHECK-NEXT: tail call void @llvm.aarch64.sme.smls.za64.single.vg4x2.nxv8i16(i32 [[ADD]], <vscale x 8 x i16> [[TMP0]], <vscale x 8 x i16> [[TMP1]], <vscale x 8 x i16> [[ZM:%.*]])
+// CHECK-NEXT: tail call void @llvm.aarch64.sme.smls.za64.single.vg4x2.nxv8i16(i32 [[SLICE_BASE:%.*]], <vscale x 8 x i16> [[TMP0]], <vscale x 8 x i16> [[TMP1]], <vscale x 8 x i16> [[ZM:%.*]])
// CHECK-NEXT: ret void
//
// CPP-CHECK-LABEL: @_Z24test_svmls_single_x2_s16j11svint16x2_tu11__SVInt16_t(
// CPP-CHECK-NEXT: entry:
-// CPP-CHECK-NEXT: [[ADD:%.*]] = add i32 [[SLICE_BASE:%.*]], 4
// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv16i16(<vscale x 16 x i16> [[ZN:%.*]], i64 0)
// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv16i16(<vscale x 16 x i16> [[ZN]], i64 8)
-// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.smls.za64.single.vg4x2.nxv8i16(i32 [[ADD]], <vscale x 8 x i16> [[TMP0]], <vscale x 8 x i16> [[TMP1]], <vscale x 8 x i16> [[ZM:%.*]])
+// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.smls.za64.single.vg4x2.nxv8i16(i32 [[SLICE_BASE:%.*]], <vscale x 8 x i16> [[TMP0]], <vscale x 8 x i16> [[TMP1]], <vscale x 8 x i16> [[ZM:%.*]])
// CPP-CHECK-NEXT: ret void
//
void test_svmls_single_x2_s16(uint32_t slice_base, svint16x2_t zn, svint16_t zm) __arm_streaming __arm_shared_za
{
- SVE_ACLE_FUNC(svmls,_single,_za64,_s16,_vg4x2)(slice_base + 4, zn, zm);
+ SVE_ACLE_FUNC(svmls,_single,_za64,_s16,_vg4x2)(slice_base, zn, zm);
}
// CHECK-LABEL: @test_svmls_single_x2_u8(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[ADD:%.*]] = add i32 [[SLICE_BASE:%.*]], 4
// CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv32i8(<vscale x 32 x i8> [[ZN:%.*]], i64 0)
// CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv32i8(<vscale x 32 x i8> [[ZN]], i64 16)
-// CHECK-NEXT: tail call void @llvm.aarch64.sme.umls.za32.single.vg4x2.nxv16i8(i32 [[ADD]], <vscale x 16 x i8> [[TMP0]], <vscale x 16 x i8> [[TMP1]], <vscale x 16 x i8> [[ZM:%.*]])
+// CHECK-NEXT: tail call void @llvm.aarch64.sme.umls.za32.single.vg4x2.nxv16i8(i32 [[SLICE_BASE:%.*]], <vscale x 16 x i8> [[TMP0]], <vscale x 16 x i8> [[TMP1]], <vscale x 16 x i8> [[ZM:%.*]])
// CHECK-NEXT: ret void
//
// CPP-CHECK-LABEL: @_Z23test_svmls_single_x2_u8j11svuint8x2_tu11__SVUint8_t(
// CPP-CHECK-NEXT: entry:
-// CPP-CHECK-NEXT: [[ADD:%.*]] = add i32 [[SLICE_BASE:%.*]], 4
// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv32i8(<vscale x 32 x i8> [[ZN:%.*]], i64 0)
// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv32i8(<vscale x 32 x i8> [[ZN]], i64 16)
-// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.umls.za32.single.vg4x2.nxv16i8(i32 [[ADD]], <vscale x 16 x i8> [[TMP0]], <vscale x 16 x i8> [[TMP1]], <vscale x 16 x i8> [[ZM:%.*]])
+// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.umls.za32.single.vg4x2.nxv16i8(i32 [[SLICE_BASE:%.*]], <vscale x 16 x i8> [[TMP0]], <vscale x 16 x i8> [[TMP1]], <vscale x 16 x i8> [[ZM:%.*]])
// CPP-CHECK-NEXT: ret void
//
void test_svmls_single_x2_u8(uint32_t slice_base, svuint8x2_t zn, svuint8_t zm) __arm_streaming __arm_shared_za
{
- SVE_ACLE_FUNC(svmls,_single,_za32,_u8,_vg4x2)(slice_base + 4, zn, zm);
+ SVE_ACLE_FUNC(svmls,_single,_za32,_u8,_vg4x2)(slice_base, zn, zm);
}
// CHECK-LABEL: @test_svmls_single_x2_u16(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[ADD:%.*]] = add i32 [[SLICE_BASE:%.*]], 4
// CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv16i16(<vscale x 16 x i16> [[ZN:%.*]], i64 0)
// CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv16i16(<vscale x 16 x i16> [[ZN]], i64 8)
-// CHECK-NEXT: tail call void @llvm.aarch64.sme.umls.za64.single.vg4x2.nxv8i16(i32 [[ADD]], <vscale x 8 x i16> [[TMP0]], <vscale x 8 x i16> [[TMP1]], <vscale x 8 x i16> [[ZM:%.*]])
+// CHECK-NEXT: tail call void @llvm.aarch64.sme.umls.za64.single.vg4x2.nxv8i16(i32 [[SLICE_BASE:%.*]], <vscale x 8 x i16> [[TMP0]], <vscale x 8 x i16> [[TMP1]], <vscale x 8 x i16> [[ZM:%.*]])
// CHECK-NEXT: ret void
//
// CPP-CHECK-LABEL: @_Z24test_svmls_single_x2_u16j12svuint16x2_tu12__SVUint16_t(
// CPP-CHECK-NEXT: entry:
-// CPP-CHECK-NEXT: [[ADD:%.*]] = add i32 [[SLICE_BASE:%.*]], 4
// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv16i16(<vscale x 16 x i16> [[ZN:%.*]], i64 0)
// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv16i16(<vscale x 16 x i16> [[ZN]], i64 8)
-// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.umls.za64.single.vg4x2.nxv8i16(i32 [[ADD]], <vscale x 8 x i16> [[TMP0]], <vscale x 8 x i16> [[TMP1]], <vscale x 8 x i16> [[ZM:%.*]])
+// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.umls.za64.single.vg4x2.nxv8i16(i32 [[SLICE_BASE:%.*]], <vscale x 8 x i16> [[TMP0]], <vscale x 8 x i16> [[TMP1]], <vscale x 8 x i16> [[ZM:%.*]])
// CPP-CHECK-NEXT: ret void
//
void test_svmls_single_x2_u16(uint32_t slice_base, svuint16x2_t zn, svuint16_t zm) __arm_streaming __arm_shared_za
{
- SVE_ACLE_FUNC(svmls,_single,_za64,_u16,_vg4x2)(slice_base + 4, zn, zm);
+ SVE_ACLE_FUNC(svmls,_single,_za64,_u16,_vg4x2)(slice_base, zn, zm);
}
// SUMLALL
// CHECK-LABEL: @test_svsumla_single_x2_s8(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[ADD:%.*]] = add i32 [[SLICE_BASE:%.*]], 4
// CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv32i8(<vscale x 32 x i8> [[ZN:%.*]], i64 0)
// CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv32i8(<vscale x 32 x i8> [[ZN]], i64 16)
-// CHECK-NEXT: tail call void @llvm.aarch64.sme.sumla.za32.single.vg4x2.nxv16i8(i32 [[ADD]], <vscale x 16 x i8> [[TMP0]], <vscale x 16 x i8> [[TMP1]], <vscale x 16 x i8> [[ZM:%.*]])
+// CHECK-NEXT: tail call void @llvm.aarch64.sme.sumla.za32.single.vg4x2.nxv16i8(i32 [[SLICE_BASE:%.*]], <vscale x 16 x i8> [[TMP0]], <vscale x 16 x i8> [[TMP1]], <vscale x 16 x i8> [[ZM:%.*]])
// CHECK-NEXT: ret void
//
// CPP-CHECK-LABEL: @_Z25test_svsumla_single_x2_s8j10svint8x2_tu11__SVUint8_t(
// CPP-CHECK-NEXT: entry:
-// CPP-CHECK-NEXT: [[ADD:%.*]] = add i32 [[SLICE_BASE:%.*]], 4
// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv32i8(<vscale x 32 x i8> [[ZN:%.*]], i64 0)
// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv32i8(<vscale x 32 x i8> [[ZN]], i64 16)
-// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.sumla.za32.single.vg4x2.nxv16i8(i32 [[ADD]], <vscale x 16 x i8> [[TMP0]], <vscale x 16 x i8> [[TMP1]], <vscale x 16 x i8> [[ZM:%.*]])
+// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.sumla.za32.single.vg4x2.nxv16i8(i32 [[SLICE_BASE:%.*]], <vscale x 16 x i8> [[TMP0]], <vscale x 16 x i8> [[TMP1]], <vscale x 16 x i8> [[ZM:%.*]])
// CPP-CHECK-NEXT: ret void
//
void test_svsumla_single_x2_s8(uint32_t slice_base, svint8x2_t zn, svuint8_t zm) __arm_streaming __arm_shared_za
{
- SVE_ACLE_FUNC(svsumla,_single,_za32,_s8,_vg4x2)(slice_base + 4, zn, zm);
+ SVE_ACLE_FUNC(svsumla,_single,_za32,_s8,_vg4x2)(slice_base, zn, zm);
}
// USMLALL
// CHECK-LABEL: @test_usmlall_single_x2_u8(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[ADD:%.*]] = add i32 [[SLICE_BASE:%.*]], 4
// CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv32i8(<vscale x 32 x i8> [[ZN:%.*]], i64 0)
// CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv32i8(<vscale x 32 x i8> [[ZN]], i64 16)
-// CHECK-NEXT: tail call void @llvm.aarch64.sme.usmla.za32.single.vg4x2.nxv16i8(i32 [[ADD]], <vscale x 16 x i8> [[TMP0]], <vscale x 16 x i8> [[TMP1]], <vscale x 16 x i8> [[ZM:%.*]])
+// CHECK-NEXT: tail call void @llvm.aarch64.sme.usmla.za32.single.vg4x2.nxv16i8(i32 [[SLICE_BASE:%.*]], <vscale x 16 x i8> [[TMP0]], <vscale x 16 x i8> [[TMP1]], <vscale x 16 x i8> [[ZM:%.*]])
// CHECK-NEXT: ret void
//
// CPP-CHECK-LABEL: @_Z25test_usmlall_single_x2_u8j11svuint8x2_tu10__SVInt8_t(
// CPP-CHECK-NEXT: entry:
-// CPP-CHECK-NEXT: [[ADD:%.*]] = add i32 [[SLICE_BASE:%.*]], 4
// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv32i8(<vscale x 32 x i8> [[ZN:%.*]], i64 0)
// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv32i8(<vscale x 32 x i8> [[ZN]], i64 16)
-// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.usmla.za32.single.vg4x2.nxv16i8(i32 [[ADD]], <vscale x 16 x i8> [[TMP0]], <vscale x 16 x i8> [[TMP1]], <vscale x 16 x i8> [[ZM:%.*]])
+// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.usmla.za32.single.vg4x2.nxv16i8(i32 [[SLICE_BASE:%.*]], <vscale x 16 x i8> [[TMP0]], <vscale x 16 x i8> [[TMP1]], <vscale x 16 x i8> [[ZM:%.*]])
// CPP-CHECK-NEXT: ret void
//
void test_usmlall_single_x2_u8(uint32_t slice_base, svuint8x2_t zn, svint8_t zm) __arm_streaming __arm_shared_za
{
- SVE_ACLE_FUNC(svusmla,_single,_za32,_u8,_vg4x2)(slice_base + 4, zn, zm);
+ SVE_ACLE_FUNC(svusmla,_single,_za32,_u8,_vg4x2)(slice_base, zn, zm);
}
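// The two mixed-sign single-vector forms above differ only in which operand
// is signed: svsumla takes a signed tuple with an unsigned zm, svusmla the
// reverse. A minimal caller sketch, assuming the non-macro intrinsic names
// below match the SVE_ACLE_FUNC expansion used in these tests and that an
// SME ACLE header declares them (here <arm_sme.h>; clang's draft header at
// this point was arm_sme_draft_spec_subject_to_change.h):
#include <stdint.h>
#include <arm_sme.h>

void sketch_mixed_single_x2(uint32_t slice_base, svint8x2_t zs, svuint8x2_t zu,
                            svuint8_t zm_u, svint8_t zm_s)
    __arm_streaming __arm_shared_za
{
  svsumla_single_za32_s8_vg4x2(slice_base, zs, zm_u); // signed zn, unsigned zm
  svusmla_single_za32_u8_vg4x2(slice_base, zu, zm_s); // unsigned zn, signed zm
}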
//
@@ -427,258 +387,238 @@ void test_usmlall_single_x2_u8(uint32_t slice_base, svuint8x2_t zn, svint8_t zm)
// CHECK-LABEL: @test_svmla_single_x4_s8(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[ADD:%.*]] = add i32 [[SLICE_BASE:%.*]], 4
// CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv64i8(<vscale x 64 x i8> [[ZN:%.*]], i64 0)
// CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv64i8(<vscale x 64 x i8> [[ZN]], i64 16)
// CHECK-NEXT: [[TMP2:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv64i8(<vscale x 64 x i8> [[ZN]], i64 32)
// CHECK-NEXT: [[TMP3:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv64i8(<vscale x 64 x i8> [[ZN]], i64 48)
-// CHECK-NEXT: tail call void @llvm.aarch64.sme.smla.za32.single.vg4x4.nxv16i8(i32 [[ADD]], <vscale x 16 x i8> [[TMP0]], <vscale x 16 x i8> [[TMP1]], <vscale x 16 x i8> [[TMP2]], <vscale x 16 x i8> [[TMP3]], <vscale x 16 x i8> [[ZM:%.*]])
+// CHECK-NEXT: tail call void @llvm.aarch64.sme.smla.za32.single.vg4x4.nxv16i8(i32 [[SLICE_BASE:%.*]], <vscale x 16 x i8> [[TMP0]], <vscale x 16 x i8> [[TMP1]], <vscale x 16 x i8> [[TMP2]], <vscale x 16 x i8> [[TMP3]], <vscale x 16 x i8> [[ZM:%.*]])
// CHECK-NEXT: ret void
//
// CPP-CHECK-LABEL: @_Z23test_svmla_single_x4_s8j10svint8x4_tu10__SVInt8_t(
// CPP-CHECK-NEXT: entry:
-// CPP-CHECK-NEXT: [[ADD:%.*]] = add i32 [[SLICE_BASE:%.*]], 4
// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv64i8(<vscale x 64 x i8> [[ZN:%.*]], i64 0)
// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv64i8(<vscale x 64 x i8> [[ZN]], i64 16)
// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv64i8(<vscale x 64 x i8> [[ZN]], i64 32)
// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv64i8(<vscale x 64 x i8> [[ZN]], i64 48)
-// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.smla.za32.single.vg4x4.nxv16i8(i32 [[ADD]], <vscale x 16 x i8> [[TMP0]], <vscale x 16 x i8> [[TMP1]], <vscale x 16 x i8> [[TMP2]], <vscale x 16 x i8> [[TMP3]], <vscale x 16 x i8> [[ZM:%.*]])
+// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.smla.za32.single.vg4x4.nxv16i8(i32 [[SLICE_BASE:%.*]], <vscale x 16 x i8> [[TMP0]], <vscale x 16 x i8> [[TMP1]], <vscale x 16 x i8> [[TMP2]], <vscale x 16 x i8> [[TMP3]], <vscale x 16 x i8> [[ZM:%.*]])
// CPP-CHECK-NEXT: ret void
//
void test_svmla_single_x4_s8(uint32_t slice_base, svint8x4_t zn, svint8_t zm) __arm_streaming __arm_shared_za
{
- SVE_ACLE_FUNC(svmla,_single,_za32,_s8,_vg4x4)(slice_base + 4, zn, zm);
+ SVE_ACLE_FUNC(svmla,_single,_za32,_s8,_vg4x4)(slice_base, zn, zm);
}
// CHECK-LABEL: @test_svmla_single_x4_s16(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[ADD:%.*]] = add i32 [[SLICE_BASE:%.*]], 4
// CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv32i16(<vscale x 32 x i16> [[ZN:%.*]], i64 0)
// CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv32i16(<vscale x 32 x i16> [[ZN]], i64 8)
// CHECK-NEXT: [[TMP2:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv32i16(<vscale x 32 x i16> [[ZN]], i64 16)
// CHECK-NEXT: [[TMP3:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv32i16(<vscale x 32 x i16> [[ZN]], i64 24)
-// CHECK-NEXT: tail call void @llvm.aarch64.sme.smla.za64.single.vg4x4.nxv8i16(i32 [[ADD]], <vscale x 8 x i16> [[TMP0]], <vscale x 8 x i16> [[TMP1]], <vscale x 8 x i16> [[TMP2]], <vscale x 8 x i16> [[TMP3]], <vscale x 8 x i16> [[ZM:%.*]])
+// CHECK-NEXT: tail call void @llvm.aarch64.sme.smla.za64.single.vg4x4.nxv8i16(i32 [[SLICE_BASE:%.*]], <vscale x 8 x i16> [[TMP0]], <vscale x 8 x i16> [[TMP1]], <vscale x 8 x i16> [[TMP2]], <vscale x 8 x i16> [[TMP3]], <vscale x 8 x i16> [[ZM:%.*]])
// CHECK-NEXT: ret void
//
// CPP-CHECK-LABEL: @_Z24test_svmla_single_x4_s16j11svint16x4_tu11__SVInt16_t(
// CPP-CHECK-NEXT: entry:
-// CPP-CHECK-NEXT: [[ADD:%.*]] = add i32 [[SLICE_BASE:%.*]], 4
// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv32i16(<vscale x 32 x i16> [[ZN:%.*]], i64 0)
// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv32i16(<vscale x 32 x i16> [[ZN]], i64 8)
// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv32i16(<vscale x 32 x i16> [[ZN]], i64 16)
// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv32i16(<vscale x 32 x i16> [[ZN]], i64 24)
-// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.smla.za64.single.vg4x4.nxv8i16(i32 [[ADD]], <vscale x 8 x i16> [[TMP0]], <vscale x 8 x i16> [[TMP1]], <vscale x 8 x i16> [[TMP2]], <vscale x 8 x i16> [[TMP3]], <vscale x 8 x i16> [[ZM:%.*]])
+// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.smla.za64.single.vg4x4.nxv8i16(i32 [[SLICE_BASE:%.*]], <vscale x 8 x i16> [[TMP0]], <vscale x 8 x i16> [[TMP1]], <vscale x 8 x i16> [[TMP2]], <vscale x 8 x i16> [[TMP3]], <vscale x 8 x i16> [[ZM:%.*]])
// CPP-CHECK-NEXT: ret void
//
void test_svmla_single_x4_s16(uint32_t slice_base, svint16x4_t zn, svint16_t zm) __arm_streaming __arm_shared_za
{
- SVE_ACLE_FUNC(svmla,_single,_za64,_s16,_vg4x4)(slice_base + 4, zn, zm);
+ SVE_ACLE_FUNC(svmla,_single,_za64,_s16,_vg4x4)(slice_base, zn, zm);
}
// CHECK-LABEL: @test_svmla_single_x4_u8(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[ADD:%.*]] = add i32 [[SLICE_BASE:%.*]], 4
// CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv64i8(<vscale x 64 x i8> [[ZN:%.*]], i64 0)
// CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv64i8(<vscale x 64 x i8> [[ZN]], i64 16)
// CHECK-NEXT: [[TMP2:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv64i8(<vscale x 64 x i8> [[ZN]], i64 32)
// CHECK-NEXT: [[TMP3:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv64i8(<vscale x 64 x i8> [[ZN]], i64 48)
-// CHECK-NEXT: tail call void @llvm.aarch64.sme.umla.za32.single.vg4x4.nxv16i8(i32 [[ADD]], <vscale x 16 x i8> [[TMP0]], <vscale x 16 x i8> [[TMP1]], <vscale x 16 x i8> [[TMP2]], <vscale x 16 x i8> [[TMP3]], <vscale x 16 x i8> [[ZM:%.*]])
+// CHECK-NEXT: tail call void @llvm.aarch64.sme.umla.za32.single.vg4x4.nxv16i8(i32 [[SLICE_BASE:%.*]], <vscale x 16 x i8> [[TMP0]], <vscale x 16 x i8> [[TMP1]], <vscale x 16 x i8> [[TMP2]], <vscale x 16 x i8> [[TMP3]], <vscale x 16 x i8> [[ZM:%.*]])
// CHECK-NEXT: ret void
//
// CPP-CHECK-LABEL: @_Z23test_svmla_single_x4_u8j11svuint8x4_tu11__SVUint8_t(
// CPP-CHECK-NEXT: entry:
-// CPP-CHECK-NEXT: [[ADD:%.*]] = add i32 [[SLICE_BASE:%.*]], 4
// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv64i8(<vscale x 64 x i8> [[ZN:%.*]], i64 0)
// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv64i8(<vscale x 64 x i8> [[ZN]], i64 16)
// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv64i8(<vscale x 64 x i8> [[ZN]], i64 32)
// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv64i8(<vscale x 64 x i8> [[ZN]], i64 48)
-// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.umla.za32.single.vg4x4.nxv16i8(i32 [[ADD]], <vscale x 16 x i8> [[TMP0]], <vscale x 16 x i8> [[TMP1]], <vscale x 16 x i8> [[TMP2]], <vscale x 16 x i8> [[TMP3]], <vscale x 16 x i8> [[ZM:%.*]])
+// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.umla.za32.single.vg4x4.nxv16i8(i32 [[SLICE_BASE:%.*]], <vscale x 16 x i8> [[TMP0]], <vscale x 16 x i8> [[TMP1]], <vscale x 16 x i8> [[TMP2]], <vscale x 16 x i8> [[TMP3]], <vscale x 16 x i8> [[ZM:%.*]])
// CPP-CHECK-NEXT: ret void
//
void test_svmla_single_x4_u8(uint32_t slice_base, svuint8x4_t zn, svuint8_t zm) __arm_streaming __arm_shared_za
{
- SVE_ACLE_FUNC(svmla,_single,_za32,_u8,_vg4x4)(slice_base + 4, zn, zm);
+ SVE_ACLE_FUNC(svmla,_single,_za32,_u8,_vg4x4)(slice_base, zn, zm);
}
// CHECK-LABEL: @test_svmla_single_x4_u16(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[ADD:%.*]] = add i32 [[SLICE_BASE:%.*]], 4
// CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv32i16(<vscale x 32 x i16> [[ZN:%.*]], i64 0)
// CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv32i16(<vscale x 32 x i16> [[ZN]], i64 8)
// CHECK-NEXT: [[TMP2:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv32i16(<vscale x 32 x i16> [[ZN]], i64 16)
// CHECK-NEXT: [[TMP3:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv32i16(<vscale x 32 x i16> [[ZN]], i64 24)
-// CHECK-NEXT: tail call void @llvm.aarch64.sme.umla.za64.single.vg4x4.nxv8i16(i32 [[ADD]], <vscale x 8 x i16> [[TMP0]], <vscale x 8 x i16> [[TMP1]], <vscale x 8 x i16> [[TMP2]], <vscale x 8 x i16> [[TMP3]], <vscale x 8 x i16> [[ZM:%.*]])
+// CHECK-NEXT: tail call void @llvm.aarch64.sme.umla.za64.single.vg4x4.nxv8i16(i32 [[SLICE_BASE:%.*]], <vscale x 8 x i16> [[TMP0]], <vscale x 8 x i16> [[TMP1]], <vscale x 8 x i16> [[TMP2]], <vscale x 8 x i16> [[TMP3]], <vscale x 8 x i16> [[ZM:%.*]])
// CHECK-NEXT: ret void
//
// CPP-CHECK-LABEL: @_Z24test_svmla_single_x4_u16j12svuint16x4_tu12__SVUint16_t(
// CPP-CHECK-NEXT: entry:
-// CPP-CHECK-NEXT: [[ADD:%.*]] = add i32 [[SLICE_BASE:%.*]], 4
// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv32i16(<vscale x 32 x i16> [[ZN:%.*]], i64 0)
// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv32i16(<vscale x 32 x i16> [[ZN]], i64 8)
// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv32i16(<vscale x 32 x i16> [[ZN]], i64 16)
// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv32i16(<vscale x 32 x i16> [[ZN]], i64 24)
-// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.umla.za64.single.vg4x4.nxv8i16(i32 [[ADD]], <vscale x 8 x i16> [[TMP0]], <vscale x 8 x i16> [[TMP1]], <vscale x 8 x i16> [[TMP2]], <vscale x 8 x i16> [[TMP3]], <vscale x 8 x i16> [[ZM:%.*]])
+// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.umla.za64.single.vg4x4.nxv8i16(i32 [[SLICE_BASE:%.*]], <vscale x 8 x i16> [[TMP0]], <vscale x 8 x i16> [[TMP1]], <vscale x 8 x i16> [[TMP2]], <vscale x 8 x i16> [[TMP3]], <vscale x 8 x i16> [[ZM:%.*]])
// CPP-CHECK-NEXT: ret void
//
void test_svmla_single_x4_u16(uint32_t slice_base, svuint16x4_t zn, svuint16_t zm) __arm_streaming __arm_shared_za
{
- SVE_ACLE_FUNC(svmla,_single,_za64,_u16,_vg4x4)(slice_base + 4, zn, zm);
+ SVE_ACLE_FUNC(svmla,_single,_za64,_u16,_vg4x4)(slice_base, zn, zm);
}
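// Sketch of the svmla "single" x4 variants exercised above: each call widens
// the products of zn[0..3] and the broadcast zm into four consecutive ZA
// slices starting at slice_base. Names are assumed from the macro expansion
// in these tests; the za64 16-bit form is expected to sit behind an
// additional sme-i16i64 guard, by analogy with the sme-f64f64 guard on the
// f64 definitions.
#include <stdint.h>
#include <arm_sme.h>

void sketch_mla_single_x4(uint32_t slice_base, svint8x4_t zn8, svint8_t zm8,
                          svuint16x4_t zn16, svuint16_t zm16)
    __arm_streaming __arm_shared_za
{
  svmla_single_za32_s8_vg4x4(slice_base, zn8, zm8);    // 8-bit sources, ZA32
  svmla_single_za64_u16_vg4x4(slice_base, zn16, zm16); // 16-bit sources, ZA64
}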
// MLSL
// CHECK-LABEL: @test_svmls_single_x4_s8(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[ADD:%.*]] = add i32 [[SLICE_BASE:%.*]], 4
// CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv64i8(<vscale x 64 x i8> [[ZN:%.*]], i64 0)
// CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv64i8(<vscale x 64 x i8> [[ZN]], i64 16)
// CHECK-NEXT: [[TMP2:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv64i8(<vscale x 64 x i8> [[ZN]], i64 32)
// CHECK-NEXT: [[TMP3:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv64i8(<vscale x 64 x i8> [[ZN]], i64 48)
-// CHECK-NEXT: tail call void @llvm.aarch64.sme.smls.za32.single.vg4x4.nxv16i8(i32 [[ADD]], <vscale x 16 x i8> [[TMP0]], <vscale x 16 x i8> [[TMP1]], <vscale x 16 x i8> [[TMP2]], <vscale x 16 x i8> [[TMP3]], <vscale x 16 x i8> [[ZM:%.*]])
+// CHECK-NEXT: tail call void @llvm.aarch64.sme.smls.za32.single.vg4x4.nxv16i8(i32 [[SLICE_BASE:%.*]], <vscale x 16 x i8> [[TMP0]], <vscale x 16 x i8> [[TMP1]], <vscale x 16 x i8> [[TMP2]], <vscale x 16 x i8> [[TMP3]], <vscale x 16 x i8> [[ZM:%.*]])
// CHECK-NEXT: ret void
//
// CPP-CHECK-LABEL: @_Z23test_svmls_single_x4_s8j10svint8x4_tu10__SVInt8_t(
// CPP-CHECK-NEXT: entry:
-// CPP-CHECK-NEXT: [[ADD:%.*]] = add i32 [[SLICE_BASE:%.*]], 4
// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv64i8(<vscale x 64 x i8> [[ZN:%.*]], i64 0)
// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv64i8(<vscale x 64 x i8> [[ZN]], i64 16)
// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv64i8(<vscale x 64 x i8> [[ZN]], i64 32)
// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv64i8(<vscale x 64 x i8> [[ZN]], i64 48)
-// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.smls.za32.single.vg4x4.nxv16i8(i32 [[ADD]], <vscale x 16 x i8> [[TMP0]], <vscale x 16 x i8> [[TMP1]], <vscale x 16 x i8> [[TMP2]], <vscale x 16 x i8> [[TMP3]], <vscale x 16 x i8> [[ZM:%.*]])
+// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.smls.za32.single.vg4x4.nxv16i8(i32 [[SLICE_BASE:%.*]], <vscale x 16 x i8> [[TMP0]], <vscale x 16 x i8> [[TMP1]], <vscale x 16 x i8> [[TMP2]], <vscale x 16 x i8> [[TMP3]], <vscale x 16 x i8> [[ZM:%.*]])
// CPP-CHECK-NEXT: ret void
//
void test_svmls_single_x4_s8(uint32_t slice_base, svint8x4_t zn, svint8_t zm) __arm_streaming __arm_shared_za
{
- SVE_ACLE_FUNC(svmls,_single,_za32,_s8,_vg4x4)(slice_base + 4, zn, zm);
+ SVE_ACLE_FUNC(svmls,_single,_za32,_s8,_vg4x4)(slice_base, zn, zm);
}
// CHECK-LABEL: @test_svmls_single_x4_s16(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[ADD:%.*]] = add i32 [[SLICE_BASE:%.*]], 4
// CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv32i16(<vscale x 32 x i16> [[ZN:%.*]], i64 0)
// CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv32i16(<vscale x 32 x i16> [[ZN]], i64 8)
// CHECK-NEXT: [[TMP2:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv32i16(<vscale x 32 x i16> [[ZN]], i64 16)
// CHECK-NEXT: [[TMP3:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv32i16(<vscale x 32 x i16> [[ZN]], i64 24)
-// CHECK-NEXT: tail call void @llvm.aarch64.sme.smls.za64.single.vg4x4.nxv8i16(i32 [[ADD]], <vscale x 8 x i16> [[TMP0]], <vscale x 8 x i16> [[TMP1]], <vscale x 8 x i16> [[TMP2]], <vscale x 8 x i16> [[TMP3]], <vscale x 8 x i16> [[ZM:%.*]])
+// CHECK-NEXT: tail call void @llvm.aarch64.sme.smls.za64.single.vg4x4.nxv8i16(i32 [[SLICE_BASE:%.*]], <vscale x 8 x i16> [[TMP0]], <vscale x 8 x i16> [[TMP1]], <vscale x 8 x i16> [[TMP2]], <vscale x 8 x i16> [[TMP3]], <vscale x 8 x i16> [[ZM:%.*]])
// CHECK-NEXT: ret void
//
// CPP-CHECK-LABEL: @_Z24test_svmls_single_x4_s16j11svint16x4_tu11__SVInt16_t(
// CPP-CHECK-NEXT: entry:
-// CPP-CHECK-NEXT: [[ADD:%.*]] = add i32 [[SLICE_BASE:%.*]], 4
// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv32i16(<vscale x 32 x i16> [[ZN:%.*]], i64 0)
// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv32i16(<vscale x 32 x i16> [[ZN]], i64 8)
// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv32i16(<vscale x 32 x i16> [[ZN]], i64 16)
// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv32i16(<vscale x 32 x i16> [[ZN]], i64 24)
-// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.smls.za64.single.vg4x4.nxv8i16(i32 [[ADD]], <vscale x 8 x i16> [[TMP0]], <vscale x 8 x i16> [[TMP1]], <vscale x 8 x i16> [[TMP2]], <vscale x 8 x i16> [[TMP3]], <vscale x 8 x i16> [[ZM:%.*]])
+// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.smls.za64.single.vg4x4.nxv8i16(i32 [[SLICE_BASE:%.*]], <vscale x 8 x i16> [[TMP0]], <vscale x 8 x i16> [[TMP1]], <vscale x 8 x i16> [[TMP2]], <vscale x 8 x i16> [[TMP3]], <vscale x 8 x i16> [[ZM:%.*]])
// CPP-CHECK-NEXT: ret void
//
void test_svmls_single_x4_s16(uint32_t slice_base, svint16x4_t zn, svint16_t zm) __arm_streaming __arm_shared_za
{
- SVE_ACLE_FUNC(svmls,_single,_za64,_s16,_vg4x4)(slice_base + 4, zn, zm);
+ SVE_ACLE_FUNC(svmls,_single,_za64,_s16,_vg4x4)(slice_base, zn, zm);
}
// CHECK-LABEL: @test_svmls_single_x4_u8(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[ADD:%.*]] = add i32 [[SLICE_BASE:%.*]], 4
// CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv64i8(<vscale x 64 x i8> [[ZN:%.*]], i64 0)
// CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv64i8(<vscale x 64 x i8> [[ZN]], i64 16)
// CHECK-NEXT: [[TMP2:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv64i8(<vscale x 64 x i8> [[ZN]], i64 32)
// CHECK-NEXT: [[TMP3:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv64i8(<vscale x 64 x i8> [[ZN]], i64 48)
-// CHECK-NEXT: tail call void @llvm.aarch64.sme.umls.za32.single.vg4x4.nxv16i8(i32 [[ADD]], <vscale x 16 x i8> [[TMP0]], <vscale x 16 x i8> [[TMP1]], <vscale x 16 x i8> [[TMP2]], <vscale x 16 x i8> [[TMP3]], <vscale x 16 x i8> [[ZM:%.*]])
+// CHECK-NEXT: tail call void @llvm.aarch64.sme.umls.za32.single.vg4x4.nxv16i8(i32 [[SLICE_BASE:%.*]], <vscale x 16 x i8> [[TMP0]], <vscale x 16 x i8> [[TMP1]], <vscale x 16 x i8> [[TMP2]], <vscale x 16 x i8> [[TMP3]], <vscale x 16 x i8> [[ZM:%.*]])
// CHECK-NEXT: ret void
//
// CPP-CHECK-LABEL: @_Z23test_svmls_single_x4_u8j11svuint8x4_tu11__SVUint8_t(
// CPP-CHECK-NEXT: entry:
-// CPP-CHECK-NEXT: [[ADD:%.*]] = add i32 [[SLICE_BASE:%.*]], 4
// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv64i8(<vscale x 64 x i8> [[ZN:%.*]], i64 0)
// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv64i8(<vscale x 64 x i8> [[ZN]], i64 16)
// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv64i8(<vscale x 64 x i8> [[ZN]], i64 32)
// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv64i8(<vscale x 64 x i8> [[ZN]], i64 48)
-// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.umls.za32.single.vg4x4.nxv16i8(i32 [[ADD]], <vscale x 16 x i8> [[TMP0]], <vscale x 16 x i8> [[TMP1]], <vscale x 16 x i8> [[TMP2]], <vscale x 16 x i8> [[TMP3]], <vscale x 16 x i8> [[ZM:%.*]])
+// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.umls.za32.single.vg4x4.nxv16i8(i32 [[SLICE_BASE:%.*]], <vscale x 16 x i8> [[TMP0]], <vscale x 16 x i8> [[TMP1]], <vscale x 16 x i8> [[TMP2]], <vscale x 16 x i8> [[TMP3]], <vscale x 16 x i8> [[ZM:%.*]])
// CPP-CHECK-NEXT: ret void
//
void test_svmls_single_x4_u8(uint32_t slice_base, svuint8x4_t zn, svuint8_t zm) __arm_streaming __arm_shared_za
{
- SVE_ACLE_FUNC(svmls,_single,_za32,_u8,_vg4x4)(slice_base + 4, zn, zm);
+ SVE_ACLE_FUNC(svmls,_single,_za32,_u8,_vg4x4)(slice_base, zn, zm);
}
// CHECK-LABEL: @test_svmls_single_x4_u16(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[ADD:%.*]] = add i32 [[SLICE_BASE:%.*]], 4
// CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv32i16(<vscale x 32 x i16> [[ZN:%.*]], i64 0)
// CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv32i16(<vscale x 32 x i16> [[ZN]], i64 8)
// CHECK-NEXT: [[TMP2:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv32i16(<vscale x 32 x i16> [[ZN]], i64 16)
// CHECK-NEXT: [[TMP3:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv32i16(<vscale x 32 x i16> [[ZN]], i64 24)
-// CHECK-NEXT: tail call void @llvm.aarch64.sme.umls.za64.single.vg4x4.nxv8i16(i32 [[ADD]], <vscale x 8 x i16> [[TMP0]], <vscale x 8 x i16> [[TMP1]], <vscale x 8 x i16> [[TMP2]], <vscale x 8 x i16> [[TMP3]], <vscale x 8 x i16> [[ZM:%.*]])
+// CHECK-NEXT: tail call void @llvm.aarch64.sme.umls.za64.single.vg4x4.nxv8i16(i32 [[SLICE_BASE:%.*]], <vscale x 8 x i16> [[TMP0]], <vscale x 8 x i16> [[TMP1]], <vscale x 8 x i16> [[TMP2]], <vscale x 8 x i16> [[TMP3]], <vscale x 8 x i16> [[ZM:%.*]])
// CHECK-NEXT: ret void
//
// CPP-CHECK-LABEL: @_Z24test_svmls_single_x4_u16j12svuint16x4_tu12__SVUint16_t(
// CPP-CHECK-NEXT: entry:
-// CPP-CHECK-NEXT: [[ADD:%.*]] = add i32 [[SLICE_BASE:%.*]], 4
// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv32i16(<vscale x 32 x i16> [[ZN:%.*]], i64 0)
// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv32i16(<vscale x 32 x i16> [[ZN]], i64 8)
// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv32i16(<vscale x 32 x i16> [[ZN]], i64 16)
// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv32i16(<vscale x 32 x i16> [[ZN]], i64 24)
-// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.umls.za64.single.vg4x4.nxv8i16(i32 [[ADD]], <vscale x 8 x i16> [[TMP0]], <vscale x 8 x i16> [[TMP1]], <vscale x 8 x i16> [[TMP2]], <vscale x 8 x i16> [[TMP3]], <vscale x 8 x i16> [[ZM:%.*]])
+// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.umls.za64.single.vg4x4.nxv8i16(i32 [[SLICE_BASE:%.*]], <vscale x 8 x i16> [[TMP0]], <vscale x 8 x i16> [[TMP1]], <vscale x 8 x i16> [[TMP2]], <vscale x 8 x i16> [[TMP3]], <vscale x 8 x i16> [[ZM:%.*]])
// CPP-CHECK-NEXT: ret void
//
void test_svmls_single_x4_u16(uint32_t slice_base, svuint16x4_t zn, svuint16_t zm) __arm_streaming __arm_shared_za
{
- SVE_ACLE_FUNC(svmls,_single,_za64,_u16,_vg4x4)(slice_base + 4, zn, zm);
+ SVE_ACLE_FUNC(svmls,_single,_za64,_u16,_vg4x4)(slice_base, zn, zm);
}
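// svmls mirrors svmla one-for-one but subtracts the widened products from
// ZA. A minimal sketch under the same naming assumptions as above:
#include <stdint.h>
#include <arm_sme.h>

void sketch_mls_single_x4(uint32_t slice_base, svuint8x4_t zn, svuint8_t zm)
    __arm_streaming __arm_shared_za
{
  svmls_single_za32_u8_vg4x4(slice_base, zn, zm); // ZA32 -= widen(zn[i]) * widen(zm)
}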
// SUMLALL
// CHECK-LABEL: @test_svsumla_single_x4_s8(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[ADD:%.*]] = add i32 [[SLICE_BASE:%.*]], 4
// CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv64i8(<vscale x 64 x i8> [[ZN:%.*]], i64 0)
// CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv64i8(<vscale x 64 x i8> [[ZN]], i64 16)
// CHECK-NEXT: [[TMP2:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv64i8(<vscale x 64 x i8> [[ZN]], i64 32)
// CHECK-NEXT: [[TMP3:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv64i8(<vscale x 64 x i8> [[ZN]], i64 48)
-// CHECK-NEXT: tail call void @llvm.aarch64.sme.sumla.za32.single.vg4x4.nxv16i8(i32 [[ADD]], <vscale x 16 x i8> [[TMP0]], <vscale x 16 x i8> [[TMP1]], <vscale x 16 x i8> [[TMP2]], <vscale x 16 x i8> [[TMP3]], <vscale x 16 x i8> [[ZM:%.*]])
+// CHECK-NEXT: tail call void @llvm.aarch64.sme.sumla.za32.single.vg4x4.nxv16i8(i32 [[SLICE_BASE:%.*]], <vscale x 16 x i8> [[TMP0]], <vscale x 16 x i8> [[TMP1]], <vscale x 16 x i8> [[TMP2]], <vscale x 16 x i8> [[TMP3]], <vscale x 16 x i8> [[ZM:%.*]])
// CHECK-NEXT: ret void
//
// CPP-CHECK-LABEL: @_Z25test_svsumla_single_x4_s8j10svint8x4_tu11__SVUint8_t(
// CPP-CHECK-NEXT: entry:
-// CPP-CHECK-NEXT: [[ADD:%.*]] = add i32 [[SLICE_BASE:%.*]], 4
// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv64i8(<vscale x 64 x i8> [[ZN:%.*]], i64 0)
// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv64i8(<vscale x 64 x i8> [[ZN]], i64 16)
// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv64i8(<vscale x 64 x i8> [[ZN]], i64 32)
// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv64i8(<vscale x 64 x i8> [[ZN]], i64 48)
-// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.sumla.za32.single.vg4x4.nxv16i8(i32 [[ADD]], <vscale x 16 x i8> [[TMP0]], <vscale x 16 x i8> [[TMP1]], <vscale x 16 x i8> [[TMP2]], <vscale x 16 x i8> [[TMP3]], <vscale x 16 x i8> [[ZM:%.*]])
+// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.sumla.za32.single.vg4x4.nxv16i8(i32 [[SLICE_BASE:%.*]], <vscale x 16 x i8> [[TMP0]], <vscale x 16 x i8> [[TMP1]], <vscale x 16 x i8> [[TMP2]], <vscale x 16 x i8> [[TMP3]], <vscale x 16 x i8> [[ZM:%.*]])
// CPP-CHECK-NEXT: ret void
//
void test_svsumla_single_x4_s8(uint32_t slice_base, svint8x4_t zn, svuint8_t zm) __arm_streaming __arm_shared_za
{
- SVE_ACLE_FUNC(svsumla,_single,_za32,_s8,_vg4x4)(slice_base + 4, zn, zm);
+ SVE_ACLE_FUNC(svsumla,_single,_za32,_s8,_vg4x4)(slice_base, zn, zm);
}
// USMLALL
// CHECK-LABEL: @test_usmlall_single_x4_u8(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[ADD:%.*]] = add i32 [[SLICE_BASE:%.*]], 4
// CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv64i8(<vscale x 64 x i8> [[ZN:%.*]], i64 0)
// CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv64i8(<vscale x 64 x i8> [[ZN]], i64 16)
// CHECK-NEXT: [[TMP2:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv64i8(<vscale x 64 x i8> [[ZN]], i64 32)
// CHECK-NEXT: [[TMP3:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv64i8(<vscale x 64 x i8> [[ZN]], i64 48)
-// CHECK-NEXT: tail call void @llvm.aarch64.sme.usmla.za32.single.vg4x4.nxv16i8(i32 [[ADD]], <vscale x 16 x i8> [[TMP0]], <vscale x 16 x i8> [[TMP1]], <vscale x 16 x i8> [[TMP2]], <vscale x 16 x i8> [[TMP3]], <vscale x 16 x i8> [[ZM:%.*]])
+// CHECK-NEXT: tail call void @llvm.aarch64.sme.usmla.za32.single.vg4x4.nxv16i8(i32 [[SLICE_BASE:%.*]], <vscale x 16 x i8> [[TMP0]], <vscale x 16 x i8> [[TMP1]], <vscale x 16 x i8> [[TMP2]], <vscale x 16 x i8> [[TMP3]], <vscale x 16 x i8> [[ZM:%.*]])
// CHECK-NEXT: ret void
//
// CPP-CHECK-LABEL: @_Z25test_usmlall_single_x4_u8j11svuint8x4_tu10__SVInt8_t(
// CPP-CHECK-NEXT: entry:
-// CPP-CHECK-NEXT: [[ADD:%.*]] = add i32 [[SLICE_BASE:%.*]], 4
// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv64i8(<vscale x 64 x i8> [[ZN:%.*]], i64 0)
// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv64i8(<vscale x 64 x i8> [[ZN]], i64 16)
// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv64i8(<vscale x 64 x i8> [[ZN]], i64 32)
// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv64i8(<vscale x 64 x i8> [[ZN]], i64 48)
-// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.usmla.za32.single.vg4x4.nxv16i8(i32 [[ADD]], <vscale x 16 x i8> [[TMP0]], <vscale x 16 x i8> [[TMP1]], <vscale x 16 x i8> [[TMP2]], <vscale x 16 x i8> [[TMP3]], <vscale x 16 x i8> [[ZM:%.*]])
+// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.usmla.za32.single.vg4x4.nxv16i8(i32 [[SLICE_BASE:%.*]], <vscale x 16 x i8> [[TMP0]], <vscale x 16 x i8> [[TMP1]], <vscale x 16 x i8> [[TMP2]], <vscale x 16 x i8> [[TMP3]], <vscale x 16 x i8> [[ZM:%.*]])
// CPP-CHECK-NEXT: ret void
//
void test_usmlall_single_x4_u8(uint32_t slice_base, svuint8x4_t zn, svint8_t zm) __arm_streaming __arm_shared_za
{
- SVE_ACLE_FUNC(svusmla,_single,_za32,_u8,_vg4x4)(slice_base + 4, zn, zm);
+ SVE_ACLE_FUNC(svusmla,_single,_za32,_u8,_vg4x4)(slice_base, zn, zm);
}
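// The x4 mixed-sign forms follow the same pattern with four-vector tuples.
// Sketch, under the same naming assumptions:
#include <stdint.h>
#include <arm_sme.h>

void sketch_mixed_single_x4(uint32_t slice_base, svint8x4_t zs, svuint8_t zm_u,
                            svuint8x4_t zu, svint8_t zm_s)
    __arm_streaming __arm_shared_za
{
  svsumla_single_za32_s8_vg4x4(slice_base, zs, zm_u); // signed zn, unsigned zm
  svusmla_single_za32_u8_vg4x4(slice_base, zu, zm_s); // unsigned zn, signed zm
}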
//
@@ -689,258 +629,238 @@ void test_usmlall_single_x4_u8(uint32_t slice_base, svuint8x4_t zn, svint8_t zm)
// CHECK-LABEL: @test_mlal_multi_x2_s8(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[ADD:%.*]] = add i32 [[SLICE_BASE:%.*]], 4
// CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv32i8(<vscale x 32 x i8> [[ZN:%.*]], i64 0)
// CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv32i8(<vscale x 32 x i8> [[ZN]], i64 16)
// CHECK-NEXT: [[TMP2:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv32i8(<vscale x 32 x i8> [[ZM:%.*]], i64 0)
// CHECK-NEXT: [[TMP3:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv32i8(<vscale x 32 x i8> [[ZM]], i64 16)
-// CHECK-NEXT: tail call void @llvm.aarch64.sme.smla.za32.vg4x2.nxv16i8(i32 [[ADD]], <vscale x 16 x i8> [[TMP0]], <vscale x 16 x i8> [[TMP1]], <vscale x 16 x i8> [[TMP2]], <vscale x 16 x i8> [[TMP3]])
+// CHECK-NEXT: tail call void @llvm.aarch64.sme.smla.za32.vg4x2.nxv16i8(i32 [[SLICE_BASE:%.*]], <vscale x 16 x i8> [[TMP0]], <vscale x 16 x i8> [[TMP1]], <vscale x 16 x i8> [[TMP2]], <vscale x 16 x i8> [[TMP3]])
// CHECK-NEXT: ret void
//
// CPP-CHECK-LABEL: @_Z21test_mlal_multi_x2_s8j10svint8x2_tS_(
// CPP-CHECK-NEXT: entry:
-// CPP-CHECK-NEXT: [[ADD:%.*]] = add i32 [[SLICE_BASE:%.*]], 4
// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv32i8(<vscale x 32 x i8> [[ZN:%.*]], i64 0)
// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv32i8(<vscale x 32 x i8> [[ZN]], i64 16)
// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv32i8(<vscale x 32 x i8> [[ZM:%.*]], i64 0)
// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv32i8(<vscale x 32 x i8> [[ZM]], i64 16)
-// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.smla.za32.vg4x2.nxv16i8(i32 [[ADD]], <vscale x 16 x i8> [[TMP0]], <vscale x 16 x i8> [[TMP1]], <vscale x 16 x i8> [[TMP2]], <vscale x 16 x i8> [[TMP3]])
+// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.smla.za32.vg4x2.nxv16i8(i32 [[SLICE_BASE:%.*]], <vscale x 16 x i8> [[TMP0]], <vscale x 16 x i8> [[TMP1]], <vscale x 16 x i8> [[TMP2]], <vscale x 16 x i8> [[TMP3]])
// CPP-CHECK-NEXT: ret void
//
void test_mlal_multi_x2_s8(uint32_t slice_base, svint8x2_t zn, svint8x2_t zm) __arm_streaming __arm_shared_za
{
- SVE_ACLE_FUNC(svmla_za32,_s8,_vg4x2,,)(slice_base + 4, zn, zm);
+ SVE_ACLE_FUNC(svmla_za32,_s8,_vg4x2,,)(slice_base, zn, zm);
}
// CHECK-LABEL: @test_mlal_multi_x2_s16(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[ADD:%.*]] = add i32 [[SLICE_BASE:%.*]], 4
// CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv16i16(<vscale x 16 x i16> [[ZN:%.*]], i64 0)
// CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv16i16(<vscale x 16 x i16> [[ZN]], i64 8)
// CHECK-NEXT: [[TMP2:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv16i16(<vscale x 16 x i16> [[ZM:%.*]], i64 0)
// CHECK-NEXT: [[TMP3:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv16i16(<vscale x 16 x i16> [[ZM]], i64 8)
-// CHECK-NEXT: tail call void @llvm.aarch64.sme.smla.za64.vg4x2.nxv8i16(i32 [[ADD]], <vscale x 8 x i16> [[TMP0]], <vscale x 8 x i16> [[TMP1]], <vscale x 8 x i16> [[TMP2]], <vscale x 8 x i16> [[TMP3]])
+// CHECK-NEXT: tail call void @llvm.aarch64.sme.smla.za64.vg4x2.nxv8i16(i32 [[SLICE_BASE:%.*]], <vscale x 8 x i16> [[TMP0]], <vscale x 8 x i16> [[TMP1]], <vscale x 8 x i16> [[TMP2]], <vscale x 8 x i16> [[TMP3]])
// CHECK-NEXT: ret void
//
// CPP-CHECK-LABEL: @_Z22test_mlal_multi_x2_s16j11svint16x2_tS_(
// CPP-CHECK-NEXT: entry:
-// CPP-CHECK-NEXT: [[ADD:%.*]] = add i32 [[SLICE_BASE:%.*]], 4
// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv16i16(<vscale x 16 x i16> [[ZN:%.*]], i64 0)
// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv16i16(<vscale x 16 x i16> [[ZN]], i64 8)
// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv16i16(<vscale x 16 x i16> [[ZM:%.*]], i64 0)
// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv16i16(<vscale x 16 x i16> [[ZM]], i64 8)
-// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.smla.za64.vg4x2.nxv8i16(i32 [[ADD]], <vscale x 8 x i16> [[TMP0]], <vscale x 8 x i16> [[TMP1]], <vscale x 8 x i16> [[TMP2]], <vscale x 8 x i16> [[TMP3]])
+// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.smla.za64.vg4x2.nxv8i16(i32 [[SLICE_BASE:%.*]], <vscale x 8 x i16> [[TMP0]], <vscale x 8 x i16> [[TMP1]], <vscale x 8 x i16> [[TMP2]], <vscale x 8 x i16> [[TMP3]])
// CPP-CHECK-NEXT: ret void
//
void test_mlal_multi_x2_s16(uint32_t slice_base, svint16x2_t zn, svint16x2_t zm) __arm_streaming __arm_shared_za
{
- SVE_ACLE_FUNC(svmla_za64,_s16,_vg4x2,,)(slice_base + 4, zn, zm);
+ SVE_ACLE_FUNC(svmla_za64,_s16,_vg4x2,,)(slice_base, zn, zm);
}
// CHECK-LABEL: @test_mlal_multi_x2_u8(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[ADD:%.*]] = add i32 [[SLICE_BASE:%.*]], 4
// CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv32i8(<vscale x 32 x i8> [[ZN:%.*]], i64 0)
// CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv32i8(<vscale x 32 x i8> [[ZN]], i64 16)
// CHECK-NEXT: [[TMP2:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv32i8(<vscale x 32 x i8> [[ZM:%.*]], i64 0)
// CHECK-NEXT: [[TMP3:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv32i8(<vscale x 32 x i8> [[ZM]], i64 16)
-// CHECK-NEXT: tail call void @llvm.aarch64.sme.umla.za32.vg4x2.nxv16i8(i32 [[ADD]], <vscale x 16 x i8> [[TMP0]], <vscale x 16 x i8> [[TMP1]], <vscale x 16 x i8> [[TMP2]], <vscale x 16 x i8> [[TMP3]])
+// CHECK-NEXT: tail call void @llvm.aarch64.sme.umla.za32.vg4x2.nxv16i8(i32 [[SLICE_BASE:%.*]], <vscale x 16 x i8> [[TMP0]], <vscale x 16 x i8> [[TMP1]], <vscale x 16 x i8> [[TMP2]], <vscale x 16 x i8> [[TMP3]])
// CHECK-NEXT: ret void
//
// CPP-CHECK-LABEL: @_Z21test_mlal_multi_x2_u8j11svuint8x2_tS_(
// CPP-CHECK-NEXT: entry:
-// CPP-CHECK-NEXT: [[ADD:%.*]] = add i32 [[SLICE_BASE:%.*]], 4
// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv32i8(<vscale x 32 x i8> [[ZN:%.*]], i64 0)
// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv32i8(<vscale x 32 x i8> [[ZN]], i64 16)
// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv32i8(<vscale x 32 x i8> [[ZM:%.*]], i64 0)
// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv32i8(<vscale x 32 x i8> [[ZM]], i64 16)
-// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.umla.za32.vg4x2.nxv16i8(i32 [[ADD]], <vscale x 16 x i8> [[TMP0]], <vscale x 16 x i8> [[TMP1]], <vscale x 16 x i8> [[TMP2]], <vscale x 16 x i8> [[TMP3]])
+// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.umla.za32.vg4x2.nxv16i8(i32 [[SLICE_BASE:%.*]], <vscale x 16 x i8> [[TMP0]], <vscale x 16 x i8> [[TMP1]], <vscale x 16 x i8> [[TMP2]], <vscale x 16 x i8> [[TMP3]])
// CPP-CHECK-NEXT: ret void
//
void test_mlal_multi_x2_u8(uint32_t slice_base, svuint8x2_t zn, svuint8x2_t zm) __arm_streaming __arm_shared_za
{
- SVE_ACLE_FUNC(svmla_za32,_u8,_vg4x2,,)(slice_base + 4, zn, zm);
+ SVE_ACLE_FUNC(svmla_za32,_u8,_vg4x2,,)(slice_base, zn, zm);
}
// CHECK-LABEL: @test_mlal_multi_x2_u16(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[ADD:%.*]] = add i32 [[SLICE_BASE:%.*]], 4
// CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv16i16(<vscale x 16 x i16> [[ZN:%.*]], i64 0)
// CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv16i16(<vscale x 16 x i16> [[ZN]], i64 8)
// CHECK-NEXT: [[TMP2:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv16i16(<vscale x 16 x i16> [[ZM:%.*]], i64 0)
// CHECK-NEXT: [[TMP3:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv16i16(<vscale x 16 x i16> [[ZM]], i64 8)
-// CHECK-NEXT: tail call void @llvm.aarch64.sme.umla.za64.vg4x2.nxv8i16(i32 [[ADD]], <vscale x 8 x i16> [[TMP0]], <vscale x 8 x i16> [[TMP1]], <vscale x 8 x i16> [[TMP2]], <vscale x 8 x i16> [[TMP3]])
+// CHECK-NEXT: tail call void @llvm.aarch64.sme.umla.za64.vg4x2.nxv8i16(i32 [[SLICE_BASE:%.*]], <vscale x 8 x i16> [[TMP0]], <vscale x 8 x i16> [[TMP1]], <vscale x 8 x i16> [[TMP2]], <vscale x 8 x i16> [[TMP3]])
// CHECK-NEXT: ret void
//
// CPP-CHECK-LABEL: @_Z22test_mlal_multi_x2_u16j12svuint16x2_tS_(
// CPP-CHECK-NEXT: entry:
-// CPP-CHECK-NEXT: [[ADD:%.*]] = add i32 [[SLICE_BASE:%.*]], 4
// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv16i16(<vscale x 16 x i16> [[ZN:%.*]], i64 0)
// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv16i16(<vscale x 16 x i16> [[ZN]], i64 8)
// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv16i16(<vscale x 16 x i16> [[ZM:%.*]], i64 0)
// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv16i16(<vscale x 16 x i16> [[ZM]], i64 8)
-// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.umla.za64.vg4x2.nxv8i16(i32 [[ADD]], <vscale x 8 x i16> [[TMP0]], <vscale x 8 x i16> [[TMP1]], <vscale x 8 x i16> [[TMP2]], <vscale x 8 x i16> [[TMP3]])
+// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.umla.za64.vg4x2.nxv8i16(i32 [[SLICE_BASE:%.*]], <vscale x 8 x i16> [[TMP0]], <vscale x 8 x i16> [[TMP1]], <vscale x 8 x i16> [[TMP2]], <vscale x 8 x i16> [[TMP3]])
// CPP-CHECK-NEXT: ret void
//
void test_mlal_multi_x2_u16(uint32_t slice_base, svuint16x2_t zn, svuint16x2_t zm) __arm_streaming __arm_shared_za
{
- SVE_ACLE_FUNC(svmla_za64,_u16,_vg4x2,,)(slice_base + 4, zn, zm);
+ SVE_ACLE_FUNC(svmla_za64,_u16,_vg4x2,,)(slice_base, zn, zm);
}
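// In the multi-vector forms both operands are tuples of equal arity, so each
// zn element is multiplied by the corresponding zm element rather than by a
// broadcast single vector. Sketch (names assumed from the macro expansion):
#include <stdint.h>
#include <arm_sme.h>

void sketch_mla_multi_x2(uint32_t slice_base, svint16x2_t zn, svint16x2_t zm)
    __arm_streaming __arm_shared_za
{
  svmla_za64_s16_vg4x2(slice_base, zn, zm); // ZA64 += widen(zn[i]) * widen(zm[i])
}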
// MLSL
// CHECK-LABEL: @test_mlsl_multi_x2_s8(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[ADD:%.*]] = add i32 [[SLICE_BASE:%.*]], 4
// CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv32i8(<vscale x 32 x i8> [[ZN:%.*]], i64 0)
// CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv32i8(<vscale x 32 x i8> [[ZN]], i64 16)
// CHECK-NEXT: [[TMP2:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv32i8(<vscale x 32 x i8> [[ZM:%.*]], i64 0)
// CHECK-NEXT: [[TMP3:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv32i8(<vscale x 32 x i8> [[ZM]], i64 16)
-// CHECK-NEXT: tail call void @llvm.aarch64.sme.smls.za32.vg4x2.nxv16i8(i32 [[ADD]], <vscale x 16 x i8> [[TMP0]], <vscale x 16 x i8> [[TMP1]], <vscale x 16 x i8> [[TMP2]], <vscale x 16 x i8> [[TMP3]])
+// CHECK-NEXT: tail call void @llvm.aarch64.sme.smls.za32.vg4x2.nxv16i8(i32 [[SLICE_BASE:%.*]], <vscale x 16 x i8> [[TMP0]], <vscale x 16 x i8> [[TMP1]], <vscale x 16 x i8> [[TMP2]], <vscale x 16 x i8> [[TMP3]])
// CHECK-NEXT: ret void
//
// CPP-CHECK-LABEL: @_Z21test_mlsl_multi_x2_s8j10svint8x2_tS_(
// CPP-CHECK-NEXT: entry:
-// CPP-CHECK-NEXT: [[ADD:%.*]] = add i32 [[SLICE_BASE:%.*]], 4
// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv32i8(<vscale x 32 x i8> [[ZN:%.*]], i64 0)
// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv32i8(<vscale x 32 x i8> [[ZN]], i64 16)
// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv32i8(<vscale x 32 x i8> [[ZM:%.*]], i64 0)
// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv32i8(<vscale x 32 x i8> [[ZM]], i64 16)
-// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.smls.za32.vg4x2.nxv16i8(i32 [[ADD]], <vscale x 16 x i8> [[TMP0]], <vscale x 16 x i8> [[TMP1]], <vscale x 16 x i8> [[TMP2]], <vscale x 16 x i8> [[TMP3]])
+// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.smls.za32.vg4x2.nxv16i8(i32 [[SLICE_BASE:%.*]], <vscale x 16 x i8> [[TMP0]], <vscale x 16 x i8> [[TMP1]], <vscale x 16 x i8> [[TMP2]], <vscale x 16 x i8> [[TMP3]])
// CPP-CHECK-NEXT: ret void
//
void test_mlsl_multi_x2_s8(uint32_t slice_base, svint8x2_t zn, svint8x2_t zm) __arm_streaming __arm_shared_za
{
- SVE_ACLE_FUNC(svmls_za32,_s8,_vg4x2,,)(slice_base + 4, zn, zm);
+ SVE_ACLE_FUNC(svmls_za32,_s8,_vg4x2,,)(slice_base, zn, zm);
}
// CHECK-LABEL: @test_mlsl_multi_x2_s16(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[ADD:%.*]] = add i32 [[SLICE_BASE:%.*]], 4
// CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv16i16(<vscale x 16 x i16> [[ZN:%.*]], i64 0)
// CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv16i16(<vscale x 16 x i16> [[ZN]], i64 8)
// CHECK-NEXT: [[TMP2:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv16i16(<vscale x 16 x i16> [[ZM:%.*]], i64 0)
// CHECK-NEXT: [[TMP3:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv16i16(<vscale x 16 x i16> [[ZM]], i64 8)
-// CHECK-NEXT: tail call void @llvm.aarch64.sme.smls.za64.vg4x2.nxv8i16(i32 [[ADD]], <vscale x 8 x i16> [[TMP0]], <vscale x 8 x i16> [[TMP1]], <vscale x 8 x i16> [[TMP2]], <vscale x 8 x i16> [[TMP3]])
+// CHECK-NEXT: tail call void @llvm.aarch64.sme.smls.za64.vg4x2.nxv8i16(i32 [[SLICE_BASE:%.*]], <vscale x 8 x i16> [[TMP0]], <vscale x 8 x i16> [[TMP1]], <vscale x 8 x i16> [[TMP2]], <vscale x 8 x i16> [[TMP3]])
// CHECK-NEXT: ret void
//
// CPP-CHECK-LABEL: @_Z22test_mlsl_multi_x2_s16j11svint16x2_tS_(
// CPP-CHECK-NEXT: entry:
-// CPP-CHECK-NEXT: [[ADD:%.*]] = add i32 [[SLICE_BASE:%.*]], 4
// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv16i16(<vscale x 16 x i16> [[ZN:%.*]], i64 0)
// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv16i16(<vscale x 16 x i16> [[ZN]], i64 8)
// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv16i16(<vscale x 16 x i16> [[ZM:%.*]], i64 0)
// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv16i16(<vscale x 16 x i16> [[ZM]], i64 8)
-// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.smls.za64.vg4x2.nxv8i16(i32 [[ADD]], <vscale x 8 x i16> [[TMP0]], <vscale x 8 x i16> [[TMP1]], <vscale x 8 x i16> [[TMP2]], <vscale x 8 x i16> [[TMP3]])
+// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.smls.za64.vg4x2.nxv8i16(i32 [[SLICE_BASE:%.*]], <vscale x 8 x i16> [[TMP0]], <vscale x 8 x i16> [[TMP1]], <vscale x 8 x i16> [[TMP2]], <vscale x 8 x i16> [[TMP3]])
// CPP-CHECK-NEXT: ret void
//
void test_mlsl_multi_x2_s16(uint32_t slice_base, svint16x2_t zn, svint16x2_t zm) __arm_streaming __arm_shared_za
{
- SVE_ACLE_FUNC(svmls_za64,_s16,_vg4x2,,)(slice_base + 4, zn, zm);
+ SVE_ACLE_FUNC(svmls_za64,_s16,_vg4x2,,)(slice_base, zn, zm);
}
// CHECK-LABEL: @test_mlsl_multi_x2_u8(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[ADD:%.*]] = add i32 [[SLICE_BASE:%.*]], 4
// CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv32i8(<vscale x 32 x i8> [[ZN:%.*]], i64 0)
// CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv32i8(<vscale x 32 x i8> [[ZN]], i64 16)
// CHECK-NEXT: [[TMP2:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv32i8(<vscale x 32 x i8> [[ZM:%.*]], i64 0)
// CHECK-NEXT: [[TMP3:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv32i8(<vscale x 32 x i8> [[ZM]], i64 16)
-// CHECK-NEXT: tail call void @llvm.aarch64.sme.umls.za32.vg4x2.nxv16i8(i32 [[ADD]], <vscale x 16 x i8> [[TMP0]], <vscale x 16 x i8> [[TMP1]], <vscale x 16 x i8> [[TMP2]], <vscale x 16 x i8> [[TMP3]])
+// CHECK-NEXT: tail call void @llvm.aarch64.sme.umls.za32.vg4x2.nxv16i8(i32 [[SLICE_BASE:%.*]], <vscale x 16 x i8> [[TMP0]], <vscale x 16 x i8> [[TMP1]], <vscale x 16 x i8> [[TMP2]], <vscale x 16 x i8> [[TMP3]])
// CHECK-NEXT: ret void
//
// CPP-CHECK-LABEL: @_Z21test_mlsl_multi_x2_u8j11svuint8x2_tS_(
// CPP-CHECK-NEXT: entry:
-// CPP-CHECK-NEXT: [[ADD:%.*]] = add i32 [[SLICE_BASE:%.*]], 4
// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv32i8(<vscale x 32 x i8> [[ZN:%.*]], i64 0)
// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv32i8(<vscale x 32 x i8> [[ZN]], i64 16)
// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv32i8(<vscale x 32 x i8> [[ZM:%.*]], i64 0)
// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv32i8(<vscale x 32 x i8> [[ZM]], i64 16)
-// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.umls.za32.vg4x2.nxv16i8(i32 [[ADD]], <vscale x 16 x i8> [[TMP0]], <vscale x 16 x i8> [[TMP1]], <vscale x 16 x i8> [[TMP2]], <vscale x 16 x i8> [[TMP3]])
+// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.umls.za32.vg4x2.nxv16i8(i32 [[SLICE_BASE:%.*]], <vscale x 16 x i8> [[TMP0]], <vscale x 16 x i8> [[TMP1]], <vscale x 16 x i8> [[TMP2]], <vscale x 16 x i8> [[TMP3]])
// CPP-CHECK-NEXT: ret void
//
void test_mlsl_multi_x2_u8(uint32_t slice_base, svuint8x2_t zn, svuint8x2_t zm) __arm_streaming __arm_shared_za
{
- SVE_ACLE_FUNC(svmls_za32,_u8,_vg4x2,,)(slice_base + 4, zn, zm);
+ SVE_ACLE_FUNC(svmls_za32,_u8,_vg4x2,,)(slice_base, zn, zm);
}
// CHECK-LABEL: @test_mlsl_multi_x2_u16(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[ADD:%.*]] = add i32 [[SLICE_BASE:%.*]], 4
// CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv16i16(<vscale x 16 x i16> [[ZN:%.*]], i64 0)
// CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv16i16(<vscale x 16 x i16> [[ZN]], i64 8)
// CHECK-NEXT: [[TMP2:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv16i16(<vscale x 16 x i16> [[ZM:%.*]], i64 0)
// CHECK-NEXT: [[TMP3:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv16i16(<vscale x 16 x i16> [[ZM]], i64 8)
-// CHECK-NEXT: tail call void @llvm.aarch64.sme.umls.za64.vg4x2.nxv8i16(i32 [[ADD]], <vscale x 8 x i16> [[TMP0]], <vscale x 8 x i16> [[TMP1]], <vscale x 8 x i16> [[TMP2]], <vscale x 8 x i16> [[TMP3]])
+// CHECK-NEXT: tail call void @llvm.aarch64.sme.umls.za64.vg4x2.nxv8i16(i32 [[SLICE_BASE:%.*]], <vscale x 8 x i16> [[TMP0]], <vscale x 8 x i16> [[TMP1]], <vscale x 8 x i16> [[TMP2]], <vscale x 8 x i16> [[TMP3]])
// CHECK-NEXT: ret void
//
// CPP-CHECK-LABEL: @_Z22test_mlsl_multi_x2_u16j12svuint16x2_tS_(
// CPP-CHECK-NEXT: entry:
-// CPP-CHECK-NEXT: [[ADD:%.*]] = add i32 [[SLICE_BASE:%.*]], 4
// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv16i16(<vscale x 16 x i16> [[ZN:%.*]], i64 0)
// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv16i16(<vscale x 16 x i16> [[ZN]], i64 8)
// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv16i16(<vscale x 16 x i16> [[ZM:%.*]], i64 0)
// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv16i16(<vscale x 16 x i16> [[ZM]], i64 8)
-// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.umls.za64.vg4x2.nxv8i16(i32 [[ADD]], <vscale x 8 x i16> [[TMP0]], <vscale x 8 x i16> [[TMP1]], <vscale x 8 x i16> [[TMP2]], <vscale x 8 x i16> [[TMP3]])
+// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.umls.za64.vg4x2.nxv8i16(i32 [[SLICE_BASE:%.*]], <vscale x 8 x i16> [[TMP0]], <vscale x 8 x i16> [[TMP1]], <vscale x 8 x i16> [[TMP2]], <vscale x 8 x i16> [[TMP3]])
// CPP-CHECK-NEXT: ret void
//
void test_mlsl_multi_x2_u16(uint32_t slice_base, svuint16x2_t zn, svuint16x2_t zm) __arm_streaming __arm_shared_za
{
- SVE_ACLE_FUNC(svmls_za64,_u16,_vg4x2,,)(slice_base + 4, zn, zm);
+ SVE_ACLE_FUNC(svmls_za64,_u16,_vg4x2,,)(slice_base, zn, zm);
}
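// The subtracting multi-vector form, as a sketch under the same assumptions:
#include <stdint.h>
#include <arm_sme.h>

void sketch_mls_multi_x2(uint32_t slice_base, svuint8x2_t zn, svuint8x2_t zm)
    __arm_streaming __arm_shared_za
{
  svmls_za32_u8_vg4x2(slice_base, zn, zm); // ZA32 -= widen(zn[i]) * widen(zm[i])
}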
// SUMLALL
// CHECK-LABEL: @test_sumlal_multi_x2_s8(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[ADD:%.*]] = add i32 [[SLICE_BASE:%.*]], 4
// CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv32i8(<vscale x 32 x i8> [[ZN:%.*]], i64 0)
// CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv32i8(<vscale x 32 x i8> [[ZN]], i64 16)
// CHECK-NEXT: [[TMP2:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv32i8(<vscale x 32 x i8> [[ZM:%.*]], i64 0)
// CHECK-NEXT: [[TMP3:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv32i8(<vscale x 32 x i8> [[ZM]], i64 16)
-// CHECK-NEXT: tail call void @llvm.aarch64.sme.usmla.za32.vg4x2.nxv16i8(i32 [[ADD]], <vscale x 16 x i8> [[TMP2]], <vscale x 16 x i8> [[TMP3]], <vscale x 16 x i8> [[TMP0]], <vscale x 16 x i8> [[TMP1]])
+// CHECK-NEXT: tail call void @llvm.aarch64.sme.usmla.za32.vg4x2.nxv16i8(i32 [[SLICE_BASE:%.*]], <vscale x 16 x i8> [[TMP2]], <vscale x 16 x i8> [[TMP3]], <vscale x 16 x i8> [[TMP0]], <vscale x 16 x i8> [[TMP1]])
// CHECK-NEXT: ret void
//
// CPP-CHECK-LABEL: @_Z23test_sumlal_multi_x2_s8j10svint8x2_t11svuint8x2_t(
// CPP-CHECK-NEXT: entry:
-// CPP-CHECK-NEXT: [[ADD:%.*]] = add i32 [[SLICE_BASE:%.*]], 4
// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv32i8(<vscale x 32 x i8> [[ZN:%.*]], i64 0)
// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv32i8(<vscale x 32 x i8> [[ZN]], i64 16)
// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv32i8(<vscale x 32 x i8> [[ZM:%.*]], i64 0)
// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv32i8(<vscale x 32 x i8> [[ZM]], i64 16)
-// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.usmla.za32.vg4x2.nxv16i8(i32 [[ADD]], <vscale x 16 x i8> [[TMP2]], <vscale x 16 x i8> [[TMP3]], <vscale x 16 x i8> [[TMP0]], <vscale x 16 x i8> [[TMP1]])
+// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.usmla.za32.vg4x2.nxv16i8(i32 [[SLICE_BASE:%.*]], <vscale x 16 x i8> [[TMP2]], <vscale x 16 x i8> [[TMP3]], <vscale x 16 x i8> [[TMP0]], <vscale x 16 x i8> [[TMP1]])
// CPP-CHECK-NEXT: ret void
//
void test_sumlal_multi_x2_s8(uint32_t slice_base, svint8x2_t zn, svuint8x2_t zm) __arm_streaming __arm_shared_za
{
- SVE_ACLE_FUNC(svsumla_za32,_s8,_vg4x2,,)(slice_base + 4, zn, zm);
+ SVE_ACLE_FUNC(svsumla_za32,_s8,_vg4x2,,)(slice_base, zn, zm);
}
// USMLALL
// CHECK-LABEL: @test_usmlal_multi_x2_u8(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[ADD:%.*]] = add i32 [[SLICE_BASE:%.*]], 4
// CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv32i8(<vscale x 32 x i8> [[ZN:%.*]], i64 0)
// CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv32i8(<vscale x 32 x i8> [[ZN]], i64 16)
// CHECK-NEXT: [[TMP2:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv32i8(<vscale x 32 x i8> [[ZM:%.*]], i64 0)
// CHECK-NEXT: [[TMP3:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv32i8(<vscale x 32 x i8> [[ZM]], i64 16)
-// CHECK-NEXT: tail call void @llvm.aarch64.sme.usmla.za32.vg4x2.nxv16i8(i32 [[ADD]], <vscale x 16 x i8> [[TMP0]], <vscale x 16 x i8> [[TMP1]], <vscale x 16 x i8> [[TMP2]], <vscale x 16 x i8> [[TMP3]])
+// CHECK-NEXT: tail call void @llvm.aarch64.sme.usmla.za32.vg4x2.nxv16i8(i32 [[SLICE_BASE:%.*]], <vscale x 16 x i8> [[TMP0]], <vscale x 16 x i8> [[TMP1]], <vscale x 16 x i8> [[TMP2]], <vscale x 16 x i8> [[TMP3]])
// CHECK-NEXT: ret void
//
// CPP-CHECK-LABEL: @_Z23test_usmlal_multi_x2_u8j11svuint8x2_t10svint8x2_t(
// CPP-CHECK-NEXT: entry:
-// CPP-CHECK-NEXT: [[ADD:%.*]] = add i32 [[SLICE_BASE:%.*]], 4
// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv32i8(<vscale x 32 x i8> [[ZN:%.*]], i64 0)
// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv32i8(<vscale x 32 x i8> [[ZN]], i64 16)
// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv32i8(<vscale x 32 x i8> [[ZM:%.*]], i64 0)
// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv32i8(<vscale x 32 x i8> [[ZM]], i64 16)
-// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.usmla.za32.vg4x2.nxv16i8(i32 [[ADD]], <vscale x 16 x i8> [[TMP0]], <vscale x 16 x i8> [[TMP1]], <vscale x 16 x i8> [[TMP2]], <vscale x 16 x i8> [[TMP3]])
+// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.usmla.za32.vg4x2.nxv16i8(i32 [[SLICE_BASE:%.*]], <vscale x 16 x i8> [[TMP0]], <vscale x 16 x i8> [[TMP1]], <vscale x 16 x i8> [[TMP2]], <vscale x 16 x i8> [[TMP3]])
// CPP-CHECK-NEXT: ret void
//
void test_usmlal_multi_x2_u8(uint32_t slice_base, svuint8x2_t zn, svint8x2_t zm) __arm_streaming __arm_shared_za
{
- SVE_ACLE_FUNC(svusmla_za32,_u8,_vg4x2,,)(slice_base + 4, zn, zm);
+ SVE_ACLE_FUNC(svusmla_za32,_u8,_vg4x2,,)(slice_base, zn, zm);
}
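// Note how the CHECK lines above lower svsumla to the same
// llvm.aarch64.sme.usmla.za32 intrinsic with the operand tuples swapped
// (TMP2/TMP3 passed before TMP0/TMP1), so a single LLVM intrinsic backs both
// mixed-sign multi-vector builtins. Caller sketch, same naming assumptions:
#include <stdint.h>
#include <arm_sme.h>

void sketch_mixed_multi_x2(uint32_t slice_base, svint8x2_t zs, svuint8x2_t zu)
    __arm_streaming __arm_shared_za
{
  svsumla_za32_s8_vg4x2(slice_base, zs, zu); // signed * unsigned
  svusmla_za32_u8_vg4x2(slice_base, zu, zs); // unsigned * signed
}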
//
@@ -951,7 +871,6 @@ void test_usmlal_multi_x2_u8(uint32_t slice_base, svuint8x2_t zn, svint8x2_t zm)
// CHECK-LABEL: @test_mlal_multi_x4_s8(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[ADD:%.*]] = add i32 [[SLICE_BASE:%.*]], 4
// CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv64i8(<vscale x 64 x i8> [[ZN:%.*]], i64 0)
// CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv64i8(<vscale x 64 x i8> [[ZN]], i64 16)
// CHECK-NEXT: [[TMP2:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv64i8(<vscale x 64 x i8> [[ZN]], i64 32)
@@ -960,12 +879,11 @@ void test_usmlal_multi_x2_u8(uint32_t slice_base, svuint8x2_t zn, svint8x2_t zm)
// CHECK-NEXT: [[TMP5:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv64i8(<vscale x 64 x i8> [[ZM]], i64 16)
// CHECK-NEXT: [[TMP6:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv64i8(<vscale x 64 x i8> [[ZM]], i64 32)
// CHECK-NEXT: [[TMP7:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv64i8(<vscale x 64 x i8> [[ZM]], i64 48)
-// CHECK-NEXT: tail call void @llvm.aarch64.sme.smla.za32.vg4x4.nxv16i8(i32 [[ADD]], <vscale x 16 x i8> [[TMP0]], <vscale x 16 x i8> [[TMP1]], <vscale x 16 x i8> [[TMP2]], <vscale x 16 x i8> [[TMP3]], <vscale x 16 x i8> [[TMP4]], <vscale x 16 x i8> [[TMP5]], <vscale x 16 x i8> [[TMP6]], <vscale x 16 x i8> [[TMP7]])
+// CHECK-NEXT: tail call void @llvm.aarch64.sme.smla.za32.vg4x4.nxv16i8(i32 [[SLICE_BASE:%.*]], <vscale x 16 x i8> [[TMP0]], <vscale x 16 x i8> [[TMP1]], <vscale x 16 x i8> [[TMP2]], <vscale x 16 x i8> [[TMP3]], <vscale x 16 x i8> [[TMP4]], <vscale x 16 x i8> [[TMP5]], <vscale x 16 x i8> [[TMP6]], <vscale x 16 x i8> [[TMP7]])
// CHECK-NEXT: ret void
//
// CPP-CHECK-LABEL: @_Z21test_mlal_multi_x4_s8j10svint8x4_tS_(
// CPP-CHECK-NEXT: entry:
-// CPP-CHECK-NEXT: [[ADD:%.*]] = add i32 [[SLICE_BASE:%.*]], 4
// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv64i8(<vscale x 64 x i8> [[ZN:%.*]], i64 0)
// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv64i8(<vscale x 64 x i8> [[ZN]], i64 16)
// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv64i8(<vscale x 64 x i8> [[ZN]], i64 32)
@@ -974,17 +892,16 @@ void test_usmlal_multi_x2_u8(uint32_t slice_base, svuint8x2_t zn, svint8x2_t zm)
// CPP-CHECK-NEXT: [[TMP5:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv64i8(<vscale x 64 x i8> [[ZM]], i64 16)
// CPP-CHECK-NEXT: [[TMP6:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv64i8(<vscale x 64 x i8> [[ZM]], i64 32)
// CPP-CHECK-NEXT: [[TMP7:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv64i8(<vscale x 64 x i8> [[ZM]], i64 48)
-// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.smla.za32.vg4x4.nxv16i8(i32 [[ADD]], <vscale x 16 x i8> [[TMP0]], <vscale x 16 x i8> [[TMP1]], <vscale x 16 x i8> [[TMP2]], <vscale x 16 x i8> [[TMP3]], <vscale x 16 x i8> [[TMP4]], <vscale x 16 x i8> [[TMP5]], <vscale x 16 x i8> [[TMP6]], <vscale x 16 x i8> [[TMP7]])
+// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.smla.za32.vg4x4.nxv16i8(i32 [[SLICE_BASE:%.*]], <vscale x 16 x i8> [[TMP0]], <vscale x 16 x i8> [[TMP1]], <vscale x 16 x i8> [[TMP2]], <vscale x 16 x i8> [[TMP3]], <vscale x 16 x i8> [[TMP4]], <vscale x 16 x i8> [[TMP5]], <vscale x 16 x i8> [[TMP6]], <vscale x 16 x i8> [[TMP7]])
// CPP-CHECK-NEXT: ret void
//
void test_mlal_multi_x4_s8(uint32_t slice_base, svint8x4_t zn, svint8x4_t zm) __arm_streaming __arm_shared_za
{
- SVE_ACLE_FUNC(svmla_za32,_s8,_vg4x4,,)(slice_base + 4, zn, zm);
+ SVE_ACLE_FUNC(svmla_za32,_s8,_vg4x4,,)(slice_base, zn, zm);
}
// CHECK-LABEL: @test_mlal_multi_x4_s16(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[ADD:%.*]] = add i32 [[SLICE_BASE:%.*]], 4
// CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv32i16(<vscale x 32 x i16> [[ZN:%.*]], i64 0)
// CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv32i16(<vscale x 32 x i16> [[ZN]], i64 8)
// CHECK-NEXT: [[TMP2:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv32i16(<vscale x 32 x i16> [[ZN]], i64 16)
@@ -993,12 +910,11 @@ void test_mlal_multi_x4_s8(uint32_t slice_base, svint8x4_t zn, svint8x4_t zm) __
// CHECK-NEXT: [[TMP5:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv32i16(<vscale x 32 x i16> [[ZM]], i64 8)
// CHECK-NEXT: [[TMP6:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv32i16(<vscale x 32 x i16> [[ZM]], i64 16)
// CHECK-NEXT: [[TMP7:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv32i16(<vscale x 32 x i16> [[ZM]], i64 24)
-// CHECK-NEXT: tail call void @llvm.aarch64.sme.smla.za64.vg4x4.nxv8i16(i32 [[ADD]], <vscale x 8 x i16> [[TMP0]], <vscale x 8 x i16> [[TMP1]], <vscale x 8 x i16> [[TMP2]], <vscale x 8 x i16> [[TMP3]], <vscale x 8 x i16> [[TMP4]], <vscale x 8 x i16> [[TMP5]], <vscale x 8 x i16> [[TMP6]], <vscale x 8 x i16> [[TMP7]])
+// CHECK-NEXT: tail call void @llvm.aarch64.sme.smla.za64.vg4x4.nxv8i16(i32 [[SLICE_BASE:%.*]], <vscale x 8 x i16> [[TMP0]], <vscale x 8 x i16> [[TMP1]], <vscale x 8 x i16> [[TMP2]], <vscale x 8 x i16> [[TMP3]], <vscale x 8 x i16> [[TMP4]], <vscale x 8 x i16> [[TMP5]], <vscale x 8 x i16> [[TMP6]], <vscale x 8 x i16> [[TMP7]])
// CHECK-NEXT: ret void
//
// CPP-CHECK-LABEL: @_Z22test_mlal_multi_x4_s16j11svint16x4_tS_(
// CPP-CHECK-NEXT: entry:
-// CPP-CHECK-NEXT: [[ADD:%.*]] = add i32 [[SLICE_BASE:%.*]], 4
// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv32i16(<vscale x 32 x i16> [[ZN:%.*]], i64 0)
// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv32i16(<vscale x 32 x i16> [[ZN]], i64 8)
// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv32i16(<vscale x 32 x i16> [[ZN]], i64 16)
@@ -1007,17 +923,16 @@ void test_mlal_multi_x4_s8(uint32_t slice_base, svint8x4_t zn, svint8x4_t zm) __
// CPP-CHECK-NEXT: [[TMP5:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv32i16(<vscale x 32 x i16> [[ZM]], i64 8)
// CPP-CHECK-NEXT: [[TMP6:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv32i16(<vscale x 32 x i16> [[ZM]], i64 16)
// CPP-CHECK-NEXT: [[TMP7:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv32i16(<vscale x 32 x i16> [[ZM]], i64 24)
-// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.smla.za64.vg4x4.nxv8i16(i32 [[ADD]], <vscale x 8 x i16> [[TMP0]], <vscale x 8 x i16> [[TMP1]], <vscale x 8 x i16> [[TMP2]], <vscale x 8 x i16> [[TMP3]], <vscale x 8 x i16> [[TMP4]], <vscale x 8 x i16> [[TMP5]], <vscale x 8 x i16> [[TMP6]], <vscale x 8 x i16> [[TMP7]])
+// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.smla.za64.vg4x4.nxv8i16(i32 [[SLICE_BASE:%.*]], <vscale x 8 x i16> [[TMP0]], <vscale x 8 x i16> [[TMP1]], <vscale x 8 x i16> [[TMP2]], <vscale x 8 x i16> [[TMP3]], <vscale x 8 x i16> [[TMP4]], <vscale x 8 x i16> [[TMP5]], <vscale x 8 x i16> [[TMP6]], <vscale x 8 x i16> [[TMP7]])
// CPP-CHECK-NEXT: ret void
//
void test_mlal_multi_x4_s16(uint32_t slice_base, svint16x4_t zn, svint16x4_t zm) __arm_streaming __arm_shared_za
{
- SVE_ACLE_FUNC(svmla_za64,_s16,_vg4x4,,)(slice_base + 4, zn, zm);
+ SVE_ACLE_FUNC(svmla_za64,_s16,_vg4x4,,)(slice_base, zn, zm);
}
// CHECK-LABEL: @test_mlal_multi_x4_u8(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[ADD:%.*]] = add i32 [[SLICE_BASE:%.*]], 4
// CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv64i8(<vscale x 64 x i8> [[ZN:%.*]], i64 0)
// CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv64i8(<vscale x 64 x i8> [[ZN]], i64 16)
// CHECK-NEXT: [[TMP2:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv64i8(<vscale x 64 x i8> [[ZN]], i64 32)
@@ -1026,12 +941,11 @@ void test_mlal_multi_x4_s16(uint32_t slice_base, svint16x4_t zn, svint16x4_t zm)
// CHECK-NEXT: [[TMP5:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv64i8(<vscale x 64 x i8> [[ZM]], i64 16)
// CHECK-NEXT: [[TMP6:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv64i8(<vscale x 64 x i8> [[ZM]], i64 32)
// CHECK-NEXT: [[TMP7:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv64i8(<vscale x 64 x i8> [[ZM]], i64 48)
-// CHECK-NEXT: tail call void @llvm.aarch64.sme.umla.za32.vg4x4.nxv16i8(i32 [[ADD]], <vscale x 16 x i8> [[TMP0]], <vscale x 16 x i8> [[TMP1]], <vscale x 16 x i8> [[TMP2]], <vscale x 16 x i8> [[TMP3]], <vscale x 16 x i8> [[TMP4]], <vscale x 16 x i8> [[TMP5]], <vscale x 16 x i8> [[TMP6]], <vscale x 16 x i8> [[TMP7]])
+// CHECK-NEXT: tail call void @llvm.aarch64.sme.umla.za32.vg4x4.nxv16i8(i32 [[SLICE_BASE:%.*]], <vscale x 16 x i8> [[TMP0]], <vscale x 16 x i8> [[TMP1]], <vscale x 16 x i8> [[TMP2]], <vscale x 16 x i8> [[TMP3]], <vscale x 16 x i8> [[TMP4]], <vscale x 16 x i8> [[TMP5]], <vscale x 16 x i8> [[TMP6]], <vscale x 16 x i8> [[TMP7]])
// CHECK-NEXT: ret void
//
// CPP-CHECK-LABEL: @_Z21test_mlal_multi_x4_u8j11svuint8x4_tS_(
// CPP-CHECK-NEXT: entry:
-// CPP-CHECK-NEXT: [[ADD:%.*]] = add i32 [[SLICE_BASE:%.*]], 4
// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv64i8(<vscale x 64 x i8> [[ZN:%.*]], i64 0)
// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv64i8(<vscale x 64 x i8> [[ZN]], i64 16)
// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv64i8(<vscale x 64 x i8> [[ZN]], i64 32)
@@ -1040,17 +954,16 @@ void test_mlal_multi_x4_s16(uint32_t slice_base, svint16x4_t zn, svint16x4_t zm)
// CPP-CHECK-NEXT: [[TMP5:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv64i8(<vscale x 64 x i8> [[ZM]], i64 16)
// CPP-CHECK-NEXT: [[TMP6:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv64i8(<vscale x 64 x i8> [[ZM]], i64 32)
// CPP-CHECK-NEXT: [[TMP7:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv64i8(<vscale x 64 x i8> [[ZM]], i64 48)
-// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.umla.za32.vg4x4.nxv16i8(i32 [[ADD]], <vscale x 16 x i8> [[TMP0]], <vscale x 16 x i8> [[TMP1]], <vscale x 16 x i8> [[TMP2]], <vscale x 16 x i8> [[TMP3]], <vscale x 16 x i8> [[TMP4]], <vscale x 16 x i8> [[TMP5]], <vscale x 16 x i8> [[TMP6]], <vscale x 16 x i8> [[TMP7]])
+// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.umla.za32.vg4x4.nxv16i8(i32 [[SLICE_BASE:%.*]], <vscale x 16 x i8> [[TMP0]], <vscale x 16 x i8> [[TMP1]], <vscale x 16 x i8> [[TMP2]], <vscale x 16 x i8> [[TMP3]], <vscale x 16 x i8> [[TMP4]], <vscale x 16 x i8> [[TMP5]], <vscale x 16 x i8> [[TMP6]], <vscale x 16 x i8> [[TMP7]])
// CPP-CHECK-NEXT: ret void
//
void test_mlal_multi_x4_u8(uint32_t slice_base, svuint8x4_t zn, svuint8x4_t zm) __arm_streaming __arm_shared_za
{
- SVE_ACLE_FUNC(svmla_za32,_u8,_vg4x4,,)(slice_base + 4, zn, zm);
+ SVE_ACLE_FUNC(svmla_za32,_u8,_vg4x4,,)(slice_base, zn, zm);
}
// CHECK-LABEL: @test_mlal_multi_x4_u16(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[ADD:%.*]] = add i32 [[SLICE_BASE:%.*]], 4
// CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv32i16(<vscale x 32 x i16> [[ZN:%.*]], i64 0)
// CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv32i16(<vscale x 32 x i16> [[ZN]], i64 8)
// CHECK-NEXT: [[TMP2:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv32i16(<vscale x 32 x i16> [[ZN]], i64 16)
@@ -1059,12 +972,11 @@ void test_mlal_multi_x4_u8(uint32_t slice_base, svuint8x4_t zn, svuint8x4_t zm)
// CHECK-NEXT: [[TMP5:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv32i16(<vscale x 32 x i16> [[ZM]], i64 8)
// CHECK-NEXT: [[TMP6:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv32i16(<vscale x 32 x i16> [[ZM]], i64 16)
// CHECK-NEXT: [[TMP7:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv32i16(<vscale x 32 x i16> [[ZM]], i64 24)
-// CHECK-NEXT: tail call void @llvm.aarch64.sme.umla.za64.vg4x4.nxv8i16(i32 [[ADD]], <vscale x 8 x i16> [[TMP0]], <vscale x 8 x i16> [[TMP1]], <vscale x 8 x i16> [[TMP2]], <vscale x 8 x i16> [[TMP3]], <vscale x 8 x i16> [[TMP4]], <vscale x 8 x i16> [[TMP5]], <vscale x 8 x i16> [[TMP6]], <vscale x 8 x i16> [[TMP7]])
+// CHECK-NEXT: tail call void @llvm.aarch64.sme.umla.za64.vg4x4.nxv8i16(i32 [[SLICE_BASE:%.*]], <vscale x 8 x i16> [[TMP0]], <vscale x 8 x i16> [[TMP1]], <vscale x 8 x i16> [[TMP2]], <vscale x 8 x i16> [[TMP3]], <vscale x 8 x i16> [[TMP4]], <vscale x 8 x i16> [[TMP5]], <vscale x 8 x i16> [[TMP6]], <vscale x 8 x i16> [[TMP7]])
// CHECK-NEXT: ret void
//
// CPP-CHECK-LABEL: @_Z22test_mlal_multi_x4_u16j12svuint16x4_tS_(
// CPP-CHECK-NEXT: entry:
-// CPP-CHECK-NEXT: [[ADD:%.*]] = add i32 [[SLICE_BASE:%.*]], 4
// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv32i16(<vscale x 32 x i16> [[ZN:%.*]], i64 0)
// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv32i16(<vscale x 32 x i16> [[ZN]], i64 8)
// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv32i16(<vscale x 32 x i16> [[ZN]], i64 16)
@@ -1073,19 +985,18 @@ void test_mlal_multi_x4_u8(uint32_t slice_base, svuint8x4_t zn, svuint8x4_t zm)
// CPP-CHECK-NEXT: [[TMP5:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv32i16(<vscale x 32 x i16> [[ZM]], i64 8)
// CPP-CHECK-NEXT: [[TMP6:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv32i16(<vscale x 32 x i16> [[ZM]], i64 16)
// CPP-CHECK-NEXT: [[TMP7:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv32i16(<vscale x 32 x i16> [[ZM]], i64 24)
-// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.umla.za64.vg4x4.nxv8i16(i32 [[ADD]], <vscale x 8 x i16> [[TMP0]], <vscale x 8 x i16> [[TMP1]], <vscale x 8 x i16> [[TMP2]], <vscale x 8 x i16> [[TMP3]], <vscale x 8 x i16> [[TMP4]], <vscale x 8 x i16> [[TMP5]], <vscale x 8 x i16> [[TMP6]], <vscale x 8 x i16> [[TMP7]])
+// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.umla.za64.vg4x4.nxv8i16(i32 [[SLICE_BASE:%.*]], <vscale x 8 x i16> [[TMP0]], <vscale x 8 x i16> [[TMP1]], <vscale x 8 x i16> [[TMP2]], <vscale x 8 x i16> [[TMP3]], <vscale x 8 x i16> [[TMP4]], <vscale x 8 x i16> [[TMP5]], <vscale x 8 x i16> [[TMP6]], <vscale x 8 x i16> [[TMP7]])
// CPP-CHECK-NEXT: ret void
//
void test_mlal_multi_x4_u16(uint32_t slice_base, svuint16x4_t zn, svuint16x4_t zm) __arm_streaming __arm_shared_za
{
- SVE_ACLE_FUNC(svmla_za64,_u16,_vg4x4,,)(slice_base + 4, zn, zm);
+ SVE_ACLE_FUNC(svmla_za64,_u16,_vg4x4,,)(slice_base, zn, zm);
}
// MLSL
// CHECK-LABEL: @test_mlsl_multi_x4_s8(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[ADD:%.*]] = add i32 [[SLICE_BASE:%.*]], 4
// CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv64i8(<vscale x 64 x i8> [[ZN:%.*]], i64 0)
// CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv64i8(<vscale x 64 x i8> [[ZN]], i64 16)
// CHECK-NEXT: [[TMP2:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv64i8(<vscale x 64 x i8> [[ZN]], i64 32)
@@ -1094,12 +1005,11 @@ void test_mlal_multi_x4_u16(uint32_t slice_base, svuint16x4_t zn, svuint16x4_t z
// CHECK-NEXT: [[TMP5:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv64i8(<vscale x 64 x i8> [[ZM]], i64 16)
// CHECK-NEXT: [[TMP6:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv64i8(<vscale x 64 x i8> [[ZM]], i64 32)
// CHECK-NEXT: [[TMP7:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv64i8(<vscale x 64 x i8> [[ZM]], i64 48)
-// CHECK-NEXT: tail call void @llvm.aarch64.sme.smls.za32.vg4x4.nxv16i8(i32 [[ADD]], <vscale x 16 x i8> [[TMP0]], <vscale x 16 x i8> [[TMP1]], <vscale x 16 x i8> [[TMP2]], <vscale x 16 x i8> [[TMP3]], <vscale x 16 x i8> [[TMP4]], <vscale x 16 x i8> [[TMP5]], <vscale x 16 x i8> [[TMP6]], <vscale x 16 x i8> [[TMP7]])
+// CHECK-NEXT: tail call void @llvm.aarch64.sme.smls.za32.vg4x4.nxv16i8(i32 [[SLICE_BASE:%.*]], <vscale x 16 x i8> [[TMP0]], <vscale x 16 x i8> [[TMP1]], <vscale x 16 x i8> [[TMP2]], <vscale x 16 x i8> [[TMP3]], <vscale x 16 x i8> [[TMP4]], <vscale x 16 x i8> [[TMP5]], <vscale x 16 x i8> [[TMP6]], <vscale x 16 x i8> [[TMP7]])
// CHECK-NEXT: ret void
//
// CPP-CHECK-LABEL: @_Z21test_mlsl_multi_x4_s8j10svint8x4_tS_(
// CPP-CHECK-NEXT: entry:
-// CPP-CHECK-NEXT: [[ADD:%.*]] = add i32 [[SLICE_BASE:%.*]], 4
// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv64i8(<vscale x 64 x i8> [[ZN:%.*]], i64 0)
// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv64i8(<vscale x 64 x i8> [[ZN]], i64 16)
// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv64i8(<vscale x 64 x i8> [[ZN]], i64 32)
@@ -1108,17 +1018,16 @@ void test_mlal_multi_x4_u16(uint32_t slice_base, svuint16x4_t zn, svuint16x4_t z
// CPP-CHECK-NEXT: [[TMP5:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv64i8(<vscale x 64 x i8> [[ZM]], i64 16)
// CPP-CHECK-NEXT: [[TMP6:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv64i8(<vscale x 64 x i8> [[ZM]], i64 32)
// CPP-CHECK-NEXT: [[TMP7:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv64i8(<vscale x 64 x i8> [[ZM]], i64 48)
-// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.smls.za32.vg4x4.nxv16i8(i32 [[ADD]], <vscale x 16 x i8> [[TMP0]], <vscale x 16 x i8> [[TMP1]], <vscale x 16 x i8> [[TMP2]], <vscale x 16 x i8> [[TMP3]], <vscale x 16 x i8> [[TMP4]], <vscale x 16 x i8> [[TMP5]], <vscale x 16 x i8> [[TMP6]], <vscale x 16 x i8> [[TMP7]])
+// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.smls.za32.vg4x4.nxv16i8(i32 [[SLICE_BASE:%.*]], <vscale x 16 x i8> [[TMP0]], <vscale x 16 x i8> [[TMP1]], <vscale x 16 x i8> [[TMP2]], <vscale x 16 x i8> [[TMP3]], <vscale x 16 x i8> [[TMP4]], <vscale x 16 x i8> [[TMP5]], <vscale x 16 x i8> [[TMP6]], <vscale x 16 x i8> [[TMP7]])
// CPP-CHECK-NEXT: ret void
//
void test_mlsl_multi_x4_s8(uint32_t slice_base, svint8x4_t zn, svint8x4_t zm) __arm_streaming __arm_shared_za
{
- SVE_ACLE_FUNC(svmls_za32,_s8,_vg4x4,,)(slice_base + 4, zn, zm);
+ SVE_ACLE_FUNC(svmls_za32,_s8,_vg4x4,,)(slice_base, zn, zm);
}
// CHECK-LABEL: @test_mlsl_multi_x4_s16(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[ADD:%.*]] = add i32 [[SLICE_BASE:%.*]], 4
// CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv32i16(<vscale x 32 x i16> [[ZN:%.*]], i64 0)
// CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv32i16(<vscale x 32 x i16> [[ZN]], i64 8)
// CHECK-NEXT: [[TMP2:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv32i16(<vscale x 32 x i16> [[ZN]], i64 16)
@@ -1127,12 +1036,11 @@ void test_mlsl_multi_x4_s8(uint32_t slice_base, svint8x4_t zn, svint8x4_t zm) __
// CHECK-NEXT: [[TMP5:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv32i16(<vscale x 32 x i16> [[ZM]], i64 8)
// CHECK-NEXT: [[TMP6:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv32i16(<vscale x 32 x i16> [[ZM]], i64 16)
// CHECK-NEXT: [[TMP7:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv32i16(<vscale x 32 x i16> [[ZM]], i64 24)
-// CHECK-NEXT: tail call void @llvm.aarch64.sme.smls.za64.vg4x4.nxv8i16(i32 [[ADD]], <vscale x 8 x i16> [[TMP0]], <vscale x 8 x i16> [[TMP1]], <vscale x 8 x i16> [[TMP2]], <vscale x 8 x i16> [[TMP3]], <vscale x 8 x i16> [[TMP4]], <vscale x 8 x i16> [[TMP5]], <vscale x 8 x i16> [[TMP6]], <vscale x 8 x i16> [[TMP7]])
+// CHECK-NEXT: tail call void @llvm.aarch64.sme.smls.za64.vg4x4.nxv8i16(i32 [[SLICE_BASE:%.*]], <vscale x 8 x i16> [[TMP0]], <vscale x 8 x i16> [[TMP1]], <vscale x 8 x i16> [[TMP2]], <vscale x 8 x i16> [[TMP3]], <vscale x 8 x i16> [[TMP4]], <vscale x 8 x i16> [[TMP5]], <vscale x 8 x i16> [[TMP6]], <vscale x 8 x i16> [[TMP7]])
// CHECK-NEXT: ret void
//
// CPP-CHECK-LABEL: @_Z22test_mlsl_multi_x4_s16j11svint16x4_tS_(
// CPP-CHECK-NEXT: entry:
-// CPP-CHECK-NEXT: [[ADD:%.*]] = add i32 [[SLICE_BASE:%.*]], 4
// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv32i16(<vscale x 32 x i16> [[ZN:%.*]], i64 0)
// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv32i16(<vscale x 32 x i16> [[ZN]], i64 8)
// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv32i16(<vscale x 32 x i16> [[ZN]], i64 16)
@@ -1141,17 +1049,16 @@ void test_mlsl_multi_x4_s8(uint32_t slice_base, svint8x4_t zn, svint8x4_t zm) __
// CPP-CHECK-NEXT: [[TMP5:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv32i16(<vscale x 32 x i16> [[ZM]], i64 8)
// CPP-CHECK-NEXT: [[TMP6:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv32i16(<vscale x 32 x i16> [[ZM]], i64 16)
// CPP-CHECK-NEXT: [[TMP7:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv32i16(<vscale x 32 x i16> [[ZM]], i64 24)
-// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.smls.za64.vg4x4.nxv8i16(i32 [[ADD]], <vscale x 8 x i16> [[TMP0]], <vscale x 8 x i16> [[TMP1]], <vscale x 8 x i16> [[TMP2]], <vscale x 8 x i16> [[TMP3]], <vscale x 8 x i16> [[TMP4]], <vscale x 8 x i16> [[TMP5]], <vscale x 8 x i16> [[TMP6]], <vscale x 8 x i16> [[TMP7]])
+// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.smls.za64.vg4x4.nxv8i16(i32 [[SLICE_BASE:%.*]], <vscale x 8 x i16> [[TMP0]], <vscale x 8 x i16> [[TMP1]], <vscale x 8 x i16> [[TMP2]], <vscale x 8 x i16> [[TMP3]], <vscale x 8 x i16> [[TMP4]], <vscale x 8 x i16> [[TMP5]], <vscale x 8 x i16> [[TMP6]], <vscale x 8 x i16> [[TMP7]])
// CPP-CHECK-NEXT: ret void
//
void test_mlsl_multi_x4_s16(uint32_t slice_base, svint16x4_t zn, svint16x4_t zm) __arm_streaming __arm_shared_za
{
- SVE_ACLE_FUNC(svmls_za64,_s16,_vg4x4,,)(slice_base + 4, zn, zm);
+ SVE_ACLE_FUNC(svmls_za64,_s16,_vg4x4,,)(slice_base, zn, zm);
}
// CHECK-LABEL: @test_mlsl_multi_x4_u8(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[ADD:%.*]] = add i32 [[SLICE_BASE:%.*]], 4
// CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv64i8(<vscale x 64 x i8> [[ZN:%.*]], i64 0)
// CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv64i8(<vscale x 64 x i8> [[ZN]], i64 16)
// CHECK-NEXT: [[TMP2:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv64i8(<vscale x 64 x i8> [[ZN]], i64 32)
@@ -1160,12 +1067,11 @@ void test_mlsl_multi_x4_s16(uint32_t slice_base, svint16x4_t zn, svint16x4_t zm)
// CHECK-NEXT: [[TMP5:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv64i8(<vscale x 64 x i8> [[ZM]], i64 16)
// CHECK-NEXT: [[TMP6:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv64i8(<vscale x 64 x i8> [[ZM]], i64 32)
// CHECK-NEXT: [[TMP7:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv64i8(<vscale x 64 x i8> [[ZM]], i64 48)
-// CHECK-NEXT: tail call void @llvm.aarch64.sme.umls.za32.vg4x4.nxv16i8(i32 [[ADD]], <vscale x 16 x i8> [[TMP0]], <vscale x 16 x i8> [[TMP1]], <vscale x 16 x i8> [[TMP2]], <vscale x 16 x i8> [[TMP3]], <vscale x 16 x i8> [[TMP4]], <vscale x 16 x i8> [[TMP5]], <vscale x 16 x i8> [[TMP6]], <vscale x 16 x i8> [[TMP7]])
+// CHECK-NEXT: tail call void @llvm.aarch64.sme.umls.za32.vg4x4.nxv16i8(i32 [[SLICE_BASE:%.*]], <vscale x 16 x i8> [[TMP0]], <vscale x 16 x i8> [[TMP1]], <vscale x 16 x i8> [[TMP2]], <vscale x 16 x i8> [[TMP3]], <vscale x 16 x i8> [[TMP4]], <vscale x 16 x i8> [[TMP5]], <vscale x 16 x i8> [[TMP6]], <vscale x 16 x i8> [[TMP7]])
// CHECK-NEXT: ret void
//
// CPP-CHECK-LABEL: @_Z21test_mlsl_multi_x4_u8j11svuint8x4_tS_(
// CPP-CHECK-NEXT: entry:
-// CPP-CHECK-NEXT: [[ADD:%.*]] = add i32 [[SLICE_BASE:%.*]], 4
// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv64i8(<vscale x 64 x i8> [[ZN:%.*]], i64 0)
// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv64i8(<vscale x 64 x i8> [[ZN]], i64 16)
// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv64i8(<vscale x 64 x i8> [[ZN]], i64 32)
@@ -1174,17 +1080,16 @@ void test_mlsl_multi_x4_s16(uint32_t slice_base, svint16x4_t zn, svint16x4_t zm)
// CPP-CHECK-NEXT: [[TMP5:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv64i8(<vscale x 64 x i8> [[ZM]], i64 16)
// CPP-CHECK-NEXT: [[TMP6:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv64i8(<vscale x 64 x i8> [[ZM]], i64 32)
// CPP-CHECK-NEXT: [[TMP7:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv64i8(<vscale x 64 x i8> [[ZM]], i64 48)
-// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.umls.za32.vg4x4.nxv16i8(i32 [[ADD]], <vscale x 16 x i8> [[TMP0]], <vscale x 16 x i8> [[TMP1]], <vscale x 16 x i8> [[TMP2]], <vscale x 16 x i8> [[TMP3]], <vscale x 16 x i8> [[TMP4]], <vscale x 16 x i8> [[TMP5]], <vscale x 16 x i8> [[TMP6]], <vscale x 16 x i8> [[TMP7]])
+// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.umls.za32.vg4x4.nxv16i8(i32 [[SLICE_BASE:%.*]], <vscale x 16 x i8> [[TMP0]], <vscale x 16 x i8> [[TMP1]], <vscale x 16 x i8> [[TMP2]], <vscale x 16 x i8> [[TMP3]], <vscale x 16 x i8> [[TMP4]], <vscale x 16 x i8> [[TMP5]], <vscale x 16 x i8> [[TMP6]], <vscale x 16 x i8> [[TMP7]])
// CPP-CHECK-NEXT: ret void
//
void test_mlsl_multi_x4_u8(uint32_t slice_base, svuint8x4_t zn, svuint8x4_t zm) __arm_streaming __arm_shared_za
{
- SVE_ACLE_FUNC(svmls_za32,_u8,_vg4x4,,)(slice_base + 4, zn, zm);
+ SVE_ACLE_FUNC(svmls_za32,_u8,_vg4x4,,)(slice_base, zn, zm);
}
// CHECK-LABEL: @test_mlsl_multi_x4_u16(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[ADD:%.*]] = add i32 [[SLICE_BASE:%.*]], 4
// CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv32i16(<vscale x 32 x i16> [[ZN:%.*]], i64 0)
// CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv32i16(<vscale x 32 x i16> [[ZN]], i64 8)
// CHECK-NEXT: [[TMP2:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv32i16(<vscale x 32 x i16> [[ZN]], i64 16)
@@ -1193,12 +1098,11 @@ void test_mlsl_multi_x4_u8(uint32_t slice_base, svuint8x4_t zn, svuint8x4_t zm)
// CHECK-NEXT: [[TMP5:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv32i16(<vscale x 32 x i16> [[ZM]], i64 8)
// CHECK-NEXT: [[TMP6:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv32i16(<vscale x 32 x i16> [[ZM]], i64 16)
// CHECK-NEXT: [[TMP7:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv32i16(<vscale x 32 x i16> [[ZM]], i64 24)
-// CHECK-NEXT: tail call void @llvm.aarch64.sme.umls.za64.vg4x4.nxv8i16(i32 [[ADD]], <vscale x 8 x i16> [[TMP0]], <vscale x 8 x i16> [[TMP1]], <vscale x 8 x i16> [[TMP2]], <vscale x 8 x i16> [[TMP3]], <vscale x 8 x i16> [[TMP4]], <vscale x 8 x i16> [[TMP5]], <vscale x 8 x i16> [[TMP6]], <vscale x 8 x i16> [[TMP7]])
+// CHECK-NEXT: tail call void @llvm.aarch64.sme.umls.za64.vg4x4.nxv8i16(i32 [[SLICE_BASE:%.*]], <vscale x 8 x i16> [[TMP0]], <vscale x 8 x i16> [[TMP1]], <vscale x 8 x i16> [[TMP2]], <vscale x 8 x i16> [[TMP3]], <vscale x 8 x i16> [[TMP4]], <vscale x 8 x i16> [[TMP5]], <vscale x 8 x i16> [[TMP6]], <vscale x 8 x i16> [[TMP7]])
// CHECK-NEXT: ret void
//
// CPP-CHECK-LABEL: @_Z22test_mlsl_multi_x4_u16j12svuint16x4_tS_(
// CPP-CHECK-NEXT: entry:
-// CPP-CHECK-NEXT: [[ADD:%.*]] = add i32 [[SLICE_BASE:%.*]], 4
// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv32i16(<vscale x 32 x i16> [[ZN:%.*]], i64 0)
// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv32i16(<vscale x 32 x i16> [[ZN]], i64 8)
// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv32i16(<vscale x 32 x i16> [[ZN]], i64 16)
@@ -1207,19 +1111,18 @@ void test_mlsl_multi_x4_u8(uint32_t slice_base, svuint8x4_t zn, svuint8x4_t zm)
// CPP-CHECK-NEXT: [[TMP5:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv32i16(<vscale x 32 x i16> [[ZM]], i64 8)
// CPP-CHECK-NEXT: [[TMP6:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv32i16(<vscale x 32 x i16> [[ZM]], i64 16)
// CPP-CHECK-NEXT: [[TMP7:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv32i16(<vscale x 32 x i16> [[ZM]], i64 24)
-// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.umls.za64.vg4x4.nxv8i16(i32 [[ADD]], <vscale x 8 x i16> [[TMP0]], <vscale x 8 x i16> [[TMP1]], <vscale x 8 x i16> [[TMP2]], <vscale x 8 x i16> [[TMP3]], <vscale x 8 x i16> [[TMP4]], <vscale x 8 x i16> [[TMP5]], <vscale x 8 x i16> [[TMP6]], <vscale x 8 x i16> [[TMP7]])
+// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.umls.za64.vg4x4.nxv8i16(i32 [[SLICE_BASE:%.*]], <vscale x 8 x i16> [[TMP0]], <vscale x 8 x i16> [[TMP1]], <vscale x 8 x i16> [[TMP2]], <vscale x 8 x i16> [[TMP3]], <vscale x 8 x i16> [[TMP4]], <vscale x 8 x i16> [[TMP5]], <vscale x 8 x i16> [[TMP6]], <vscale x 8 x i16> [[TMP7]])
// CPP-CHECK-NEXT: ret void
//
void test_mlsl_multi_x4_u16(uint32_t slice_base, svuint16x4_t zn, svuint16x4_t zm) __arm_streaming __arm_shared_za
{
- SVE_ACLE_FUNC(svmls_za64,_u16,_vg4x4,,)(slice_base + 4, zn, zm);
+ SVE_ACLE_FUNC(svmls_za64,_u16,_vg4x4,,)(slice_base, zn, zm);
}
// SUMLALL
// CHECK-LABEL: @test_sumlal_multi_x4_s8(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[ADD:%.*]] = add i32 [[SLICE_BASE:%.*]], 4
// CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv64i8(<vscale x 64 x i8> [[ZN:%.*]], i64 0)
// CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv64i8(<vscale x 64 x i8> [[ZN]], i64 16)
// CHECK-NEXT: [[TMP2:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv64i8(<vscale x 64 x i8> [[ZN]], i64 32)
@@ -1228,12 +1131,11 @@ void test_mlsl_multi_x4_u16(uint32_t slice_base, svuint16x4_t zn, svuint16x4_t z
// CHECK-NEXT: [[TMP5:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv64i8(<vscale x 64 x i8> [[ZM]], i64 16)
// CHECK-NEXT: [[TMP6:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv64i8(<vscale x 64 x i8> [[ZM]], i64 32)
// CHECK-NEXT: [[TMP7:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv64i8(<vscale x 64 x i8> [[ZM]], i64 48)
-// CHECK-NEXT: tail call void @llvm.aarch64.sme.usmla.za32.vg4x4.nxv16i8(i32 [[ADD]], <vscale x 16 x i8> [[TMP4]], <vscale x 16 x i8> [[TMP5]], <vscale x 16 x i8> [[TMP6]], <vscale x 16 x i8> [[TMP7]], <vscale x 16 x i8> [[TMP0]], <vscale x 16 x i8> [[TMP1]], <vscale x 16 x i8> [[TMP2]], <vscale x 16 x i8> [[TMP3]])
+// CHECK-NEXT: tail call void @llvm.aarch64.sme.usmla.za32.vg4x4.nxv16i8(i32 [[SLICE_BASE:%.*]], <vscale x 16 x i8> [[TMP4]], <vscale x 16 x i8> [[TMP5]], <vscale x 16 x i8> [[TMP6]], <vscale x 16 x i8> [[TMP7]], <vscale x 16 x i8> [[TMP0]], <vscale x 16 x i8> [[TMP1]], <vscale x 16 x i8> [[TMP2]], <vscale x 16 x i8> [[TMP3]])
// CHECK-NEXT: ret void
//
// CPP-CHECK-LABEL: @_Z23test_sumlal_multi_x4_s8j10svint8x4_t11svuint8x4_t(
// CPP-CHECK-NEXT: entry:
-// CPP-CHECK-NEXT: [[ADD:%.*]] = add i32 [[SLICE_BASE:%.*]], 4
// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv64i8(<vscale x 64 x i8> [[ZN:%.*]], i64 0)
// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv64i8(<vscale x 64 x i8> [[ZN]], i64 16)
// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv64i8(<vscale x 64 x i8> [[ZN]], i64 32)
@@ -1242,19 +1144,18 @@ void test_mlsl_multi_x4_u16(uint32_t slice_base, svuint16x4_t zn, svuint16x4_t z
// CPP-CHECK-NEXT: [[TMP5:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv64i8(<vscale x 64 x i8> [[ZM]], i64 16)
// CPP-CHECK-NEXT: [[TMP6:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv64i8(<vscale x 64 x i8> [[ZM]], i64 32)
// CPP-CHECK-NEXT: [[TMP7:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv64i8(<vscale x 64 x i8> [[ZM]], i64 48)
-// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.usmla.za32.vg4x4.nxv16i8(i32 [[ADD]], <vscale x 16 x i8> [[TMP4]], <vscale x 16 x i8> [[TMP5]], <vscale x 16 x i8> [[TMP6]], <vscale x 16 x i8> [[TMP7]], <vscale x 16 x i8> [[TMP0]], <vscale x 16 x i8> [[TMP1]], <vscale x 16 x i8> [[TMP2]], <vscale x 16 x i8> [[TMP3]])
+// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.usmla.za32.vg4x4.nxv16i8(i32 [[SLICE_BASE:%.*]], <vscale x 16 x i8> [[TMP4]], <vscale x 16 x i8> [[TMP5]], <vscale x 16 x i8> [[TMP6]], <vscale x 16 x i8> [[TMP7]], <vscale x 16 x i8> [[TMP0]], <vscale x 16 x i8> [[TMP1]], <vscale x 16 x i8> [[TMP2]], <vscale x 16 x i8> [[TMP3]])
// CPP-CHECK-NEXT: ret void
//
void test_sumlal_multi_x4_s8(uint32_t slice_base, svint8x4_t zn, svuint8x4_t zm) __arm_streaming __arm_shared_za
{
- SVE_ACLE_FUNC(svsumla_za32,_s8,_vg4x4,,)(slice_base + 4, zn, zm);
+ SVE_ACLE_FUNC(svsumla_za32,_s8,_vg4x4,,)(slice_base, zn, zm);
}
// USMLALL
// CHECK-LABEL: @test_usmlal_multi_x4_u8(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[ADD:%.*]] = add i32 [[SLICE_BASE:%.*]], 4
// CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv64i8(<vscale x 64 x i8> [[ZN:%.*]], i64 0)
// CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv64i8(<vscale x 64 x i8> [[ZN]], i64 16)
// CHECK-NEXT: [[TMP2:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv64i8(<vscale x 64 x i8> [[ZN]], i64 32)
@@ -1263,12 +1164,11 @@ void test_sumlal_multi_x4_s8(uint32_t slice_base, svint8x4_t zn, svuint8x4_t zm)
// CHECK-NEXT: [[TMP5:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv64i8(<vscale x 64 x i8> [[ZM]], i64 16)
// CHECK-NEXT: [[TMP6:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv64i8(<vscale x 64 x i8> [[ZM]], i64 32)
// CHECK-NEXT: [[TMP7:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv64i8(<vscale x 64 x i8> [[ZM]], i64 48)
-// CHECK-NEXT: tail call void @llvm.aarch64.sme.usmla.za32.vg4x4.nxv16i8(i32 [[ADD]], <vscale x 16 x i8> [[TMP0]], <vscale x 16 x i8> [[TMP1]], <vscale x 16 x i8> [[TMP2]], <vscale x 16 x i8> [[TMP3]], <vscale x 16 x i8> [[TMP4]], <vscale x 16 x i8> [[TMP5]], <vscale x 16 x i8> [[TMP6]], <vscale x 16 x i8> [[TMP7]])
+// CHECK-NEXT: tail call void @llvm.aarch64.sme.usmla.za32.vg4x4.nxv16i8(i32 [[SLICE_BASE:%.*]], <vscale x 16 x i8> [[TMP0]], <vscale x 16 x i8> [[TMP1]], <vscale x 16 x i8> [[TMP2]], <vscale x 16 x i8> [[TMP3]], <vscale x 16 x i8> [[TMP4]], <vscale x 16 x i8> [[TMP5]], <vscale x 16 x i8> [[TMP6]], <vscale x 16 x i8> [[TMP7]])
// CHECK-NEXT: ret void
//
// CPP-CHECK-LABEL: @_Z23test_usmlal_multi_x4_u8j11svuint8x4_t10svint8x4_t(
// CPP-CHECK-NEXT: entry:
-// CPP-CHECK-NEXT: [[ADD:%.*]] = add i32 [[SLICE_BASE:%.*]], 4
// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv64i8(<vscale x 64 x i8> [[ZN:%.*]], i64 0)
// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv64i8(<vscale x 64 x i8> [[ZN]], i64 16)
// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv64i8(<vscale x 64 x i8> [[ZN]], i64 32)
@@ -1277,12 +1177,12 @@ void test_sumlal_multi_x4_s8(uint32_t slice_base, svint8x4_t zn, svuint8x4_t zm)
// CPP-CHECK-NEXT: [[TMP5:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv64i8(<vscale x 64 x i8> [[ZM]], i64 16)
// CPP-CHECK-NEXT: [[TMP6:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv64i8(<vscale x 64 x i8> [[ZM]], i64 32)
// CPP-CHECK-NEXT: [[TMP7:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv64i8(<vscale x 64 x i8> [[ZM]], i64 48)
-// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.usmla.za32.vg4x4.nxv16i8(i32 [[ADD]], <vscale x 16 x i8> [[TMP0]], <vscale x 16 x i8> [[TMP1]], <vscale x 16 x i8> [[TMP2]], <vscale x 16 x i8> [[TMP3]], <vscale x 16 x i8> [[TMP4]], <vscale x 16 x i8> [[TMP5]], <vscale x 16 x i8> [[TMP6]], <vscale x 16 x i8> [[TMP7]])
+// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.usmla.za32.vg4x4.nxv16i8(i32 [[SLICE_BASE:%.*]], <vscale x 16 x i8> [[TMP0]], <vscale x 16 x i8> [[TMP1]], <vscale x 16 x i8> [[TMP2]], <vscale x 16 x i8> [[TMP3]], <vscale x 16 x i8> [[TMP4]], <vscale x 16 x i8> [[TMP5]], <vscale x 16 x i8> [[TMP6]], <vscale x 16 x i8> [[TMP7]])
// CPP-CHECK-NEXT: ret void
//
void test_usmlal_multi_x4_u8(uint32_t slice_base, svuint8x4_t zn, svint8x4_t zm) __arm_streaming __arm_shared_za
{
- SVE_ACLE_FUNC(svusmla_za32,_u8,_vg4x4,,)(slice_base + 4, zn, zm);
+ SVE_ACLE_FUNC(svusmla_za32,_u8,_vg4x4,,)(slice_base, zn, zm);
}
//
@@ -1293,178 +1193,158 @@ void test_usmlal_multi_x4_u8(uint32_t slice_base, svuint8x4_t zn, svint8x4_t zm)
// CHECK-LABEL: @test_smlal_lane_x1_s8(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[ADD:%.*]] = add i32 [[SLICE_BASE:%.*]], 12
-// CHECK-NEXT: tail call void @llvm.aarch64.sme.smla.za32.lane.vg4x1.nxv16i8(i32 [[ADD]], <vscale x 16 x i8> [[ZN:%.*]], <vscale x 16 x i8> [[ZM:%.*]], i32 15)
+// CHECK-NEXT: tail call void @llvm.aarch64.sme.smla.za32.lane.vg4x1.nxv16i8(i32 [[SLICE_BASE:%.*]], <vscale x 16 x i8> [[ZN:%.*]], <vscale x 16 x i8> [[ZM:%.*]], i32 15)
// CHECK-NEXT: ret void
//
// CPP-CHECK-LABEL: @_Z21test_smlal_lane_x1_s8ju10__SVInt8_tS_(
// CPP-CHECK-NEXT: entry:
-// CPP-CHECK-NEXT: [[ADD:%.*]] = add i32 [[SLICE_BASE:%.*]], 12
-// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.smla.za32.lane.vg4x1.nxv16i8(i32 [[ADD]], <vscale x 16 x i8> [[ZN:%.*]], <vscale x 16 x i8> [[ZM:%.*]], i32 15)
+// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.smla.za32.lane.vg4x1.nxv16i8(i32 [[SLICE_BASE:%.*]], <vscale x 16 x i8> [[ZN:%.*]], <vscale x 16 x i8> [[ZM:%.*]], i32 15)
// CPP-CHECK-NEXT: ret void
//
void test_smlal_lane_x1_s8(uint32_t slice_base, svint8_t zn, svint8_t zm) __arm_streaming __arm_shared_za
{
- SVE_ACLE_FUNC(svmla_lane_za32,_s8,_vg4x1,,)(slice_base + 12, zn, zm, 15);
+ SVE_ACLE_FUNC(svmla_lane_za32,_s8,_vg4x1,,)(slice_base, zn, zm, 15);
}
// CHECK-LABEL: @test_smlal_lane_x1_s16(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[ADD:%.*]] = add i32 [[SLICE_BASE:%.*]], 12
-// CHECK-NEXT: tail call void @llvm.aarch64.sme.smla.za64.lane.vg4x1.nxv8i16(i32 [[ADD]], <vscale x 8 x i16> [[ZN:%.*]], <vscale x 8 x i16> [[ZM:%.*]], i32 7)
+// CHECK-NEXT: tail call void @llvm.aarch64.sme.smla.za64.lane.vg4x1.nxv8i16(i32 [[SLICE_BASE:%.*]], <vscale x 8 x i16> [[ZN:%.*]], <vscale x 8 x i16> [[ZM:%.*]], i32 7)
// CHECK-NEXT: ret void
//
// CPP-CHECK-LABEL: @_Z22test_smlal_lane_x1_s16ju11__SVInt16_tS_(
// CPP-CHECK-NEXT: entry:
-// CPP-CHECK-NEXT: [[ADD:%.*]] = add i32 [[SLICE_BASE:%.*]], 12
-// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.smla.za64.lane.vg4x1.nxv8i16(i32 [[ADD]], <vscale x 8 x i16> [[ZN:%.*]], <vscale x 8 x i16> [[ZM:%.*]], i32 7)
+// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.smla.za64.lane.vg4x1.nxv8i16(i32 [[SLICE_BASE:%.*]], <vscale x 8 x i16> [[ZN:%.*]], <vscale x 8 x i16> [[ZM:%.*]], i32 7)
// CPP-CHECK-NEXT: ret void
//
void test_smlal_lane_x1_s16(uint32_t slice_base, svint16_t zn, svint16_t zm) __arm_streaming __arm_shared_za
{
- SVE_ACLE_FUNC(svmla_lane_za64,_s16,_vg4x1,,)(slice_base + 12, zn, zm, 7);
+ SVE_ACLE_FUNC(svmla_lane_za64,_s16,_vg4x1,,)(slice_base, zn, zm, 7);
}
// CHECK-LABEL: @test_smlal_lane_x1_u8(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[ADD:%.*]] = add i32 [[SLICE_BASE:%.*]], 12
-// CHECK-NEXT: tail call void @llvm.aarch64.sme.umla.za32.lane.vg4x1.nxv16i8(i32 [[ADD]], <vscale x 16 x i8> [[ZN:%.*]], <vscale x 16 x i8> [[ZM:%.*]], i32 15)
+// CHECK-NEXT: tail call void @llvm.aarch64.sme.umla.za32.lane.vg4x1.nxv16i8(i32 [[SLICE_BASE:%.*]], <vscale x 16 x i8> [[ZN:%.*]], <vscale x 16 x i8> [[ZM:%.*]], i32 15)
// CHECK-NEXT: ret void
//
// CPP-CHECK-LABEL: @_Z21test_smlal_lane_x1_u8ju11__SVUint8_tS_(
// CPP-CHECK-NEXT: entry:
-// CPP-CHECK-NEXT: [[ADD:%.*]] = add i32 [[SLICE_BASE:%.*]], 12
-// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.umla.za32.lane.vg4x1.nxv16i8(i32 [[ADD]], <vscale x 16 x i8> [[ZN:%.*]], <vscale x 16 x i8> [[ZM:%.*]], i32 15)
+// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.umla.za32.lane.vg4x1.nxv16i8(i32 [[SLICE_BASE:%.*]], <vscale x 16 x i8> [[ZN:%.*]], <vscale x 16 x i8> [[ZM:%.*]], i32 15)
// CPP-CHECK-NEXT: ret void
//
void test_smlal_lane_x1_u8(uint32_t slice_base, svuint8_t zn, svuint8_t zm) __arm_streaming __arm_shared_za
{
- SVE_ACLE_FUNC(svmla_lane_za32,_u8,_vg4x1,,)(slice_base + 12, zn, zm, 15);
+ SVE_ACLE_FUNC(svmla_lane_za32,_u8,_vg4x1,,)(slice_base, zn, zm, 15);
}
// CHECK-LABEL: @test_smlal_lane_x1_u16(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[ADD:%.*]] = add i32 [[SLICE_BASE:%.*]], 12
-// CHECK-NEXT: tail call void @llvm.aarch64.sme.umla.za64.lane.vg4x1.nxv8i16(i32 [[ADD]], <vscale x 8 x i16> [[ZN:%.*]], <vscale x 8 x i16> [[ZM:%.*]], i32 7)
+// CHECK-NEXT: tail call void @llvm.aarch64.sme.umla.za64.lane.vg4x1.nxv8i16(i32 [[SLICE_BASE:%.*]], <vscale x 8 x i16> [[ZN:%.*]], <vscale x 8 x i16> [[ZM:%.*]], i32 7)
// CHECK-NEXT: ret void
//
// CPP-CHECK-LABEL: @_Z22test_smlal_lane_x1_u16ju12__SVUint16_tS_(
// CPP-CHECK-NEXT: entry:
-// CPP-CHECK-NEXT: [[ADD:%.*]] = add i32 [[SLICE_BASE:%.*]], 12
-// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.umla.za64.lane.vg4x1.nxv8i16(i32 [[ADD]], <vscale x 8 x i16> [[ZN:%.*]], <vscale x 8 x i16> [[ZM:%.*]], i32 7)
+// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.umla.za64.lane.vg4x1.nxv8i16(i32 [[SLICE_BASE:%.*]], <vscale x 8 x i16> [[ZN:%.*]], <vscale x 8 x i16> [[ZM:%.*]], i32 7)
// CPP-CHECK-NEXT: ret void
//
void test_smlal_lane_x1_u16(uint32_t slice_base, svuint16_t zn, svuint16_t zm) __arm_streaming __arm_shared_za
{
- SVE_ACLE_FUNC(svmla_lane_za64,_u16,_vg4x1,,)(slice_base + 12, zn, zm, 7);
+ SVE_ACLE_FUNC(svmla_lane_za64,_u16,_vg4x1,,)(slice_base, zn, zm, 7);
}
// SMLSL
// CHECK-LABEL: @test_smlsl_lane_x1_s8(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[ADD:%.*]] = add i32 [[SLICE_BASE:%.*]], 12
-// CHECK-NEXT: tail call void @llvm.aarch64.sme.smls.za32.lane.vg4x1.nxv16i8(i32 [[ADD]], <vscale x 16 x i8> [[ZN:%.*]], <vscale x 16 x i8> [[ZM:%.*]], i32 15)
+// CHECK-NEXT: tail call void @llvm.aarch64.sme.smls.za32.lane.vg4x1.nxv16i8(i32 [[SLICE_BASE:%.*]], <vscale x 16 x i8> [[ZN:%.*]], <vscale x 16 x i8> [[ZM:%.*]], i32 15)
// CHECK-NEXT: ret void
//
// CPP-CHECK-LABEL: @_Z21test_smlsl_lane_x1_s8ju10__SVInt8_tS_(
// CPP-CHECK-NEXT: entry:
-// CPP-CHECK-NEXT: [[ADD:%.*]] = add i32 [[SLICE_BASE:%.*]], 12
-// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.smls.za32.lane.vg4x1.nxv16i8(i32 [[ADD]], <vscale x 16 x i8> [[ZN:%.*]], <vscale x 16 x i8> [[ZM:%.*]], i32 15)
+// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.smls.za32.lane.vg4x1.nxv16i8(i32 [[SLICE_BASE:%.*]], <vscale x 16 x i8> [[ZN:%.*]], <vscale x 16 x i8> [[ZM:%.*]], i32 15)
// CPP-CHECK-NEXT: ret void
//
void test_smlsl_lane_x1_s8(uint32_t slice_base, svint8_t zn, svint8_t zm) __arm_streaming __arm_shared_za
{
- SVE_ACLE_FUNC(svmls_lane_za32,_s8,_vg4x1,,)(slice_base + 12, zn, zm, 15);
+ SVE_ACLE_FUNC(svmls_lane_za32,_s8,_vg4x1,,)(slice_base, zn, zm, 15);
}
// CHECK-LABEL: @test_smlsl_lane_x1_s16(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[ADD:%.*]] = add i32 [[SLICE_BASE:%.*]], 12
-// CHECK-NEXT: tail call void @llvm.aarch64.sme.smls.za64.lane.vg4x1.nxv8i16(i32 [[ADD]], <vscale x 8 x i16> [[ZN:%.*]], <vscale x 8 x i16> [[ZM:%.*]], i32 7)
+// CHECK-NEXT: tail call void @llvm.aarch64.sme.smls.za64.lane.vg4x1.nxv8i16(i32 [[SLICE_BASE:%.*]], <vscale x 8 x i16> [[ZN:%.*]], <vscale x 8 x i16> [[ZM:%.*]], i32 7)
// CHECK-NEXT: ret void
//
// CPP-CHECK-LABEL: @_Z22test_smlsl_lane_x1_s16ju11__SVInt16_tS_(
// CPP-CHECK-NEXT: entry:
-// CPP-CHECK-NEXT: [[ADD:%.*]] = add i32 [[SLICE_BASE:%.*]], 12
-// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.smls.za64.lane.vg4x1.nxv8i16(i32 [[ADD]], <vscale x 8 x i16> [[ZN:%.*]], <vscale x 8 x i16> [[ZM:%.*]], i32 7)
+// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.smls.za64.lane.vg4x1.nxv8i16(i32 [[SLICE_BASE:%.*]], <vscale x 8 x i16> [[ZN:%.*]], <vscale x 8 x i16> [[ZM:%.*]], i32 7)
// CPP-CHECK-NEXT: ret void
//
void test_smlsl_lane_x1_s16(uint32_t slice_base, svint16_t zn, svint16_t zm) __arm_streaming __arm_shared_za
{
- SVE_ACLE_FUNC(svmls_lane_za64,_s16,_vg4x1,,)(slice_base + 12, zn, zm, 7);
+ SVE_ACLE_FUNC(svmls_lane_za64,_s16,_vg4x1,,)(slice_base, zn, zm, 7);
}
// CHECK-LABEL: @test_smlsl_lane_x1_u8(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[ADD:%.*]] = add i32 [[SLICE_BASE:%.*]], 12
-// CHECK-NEXT: tail call void @llvm.aarch64.sme.umls.za32.lane.vg4x1.nxv16i8(i32 [[ADD]], <vscale x 16 x i8> [[ZN:%.*]], <vscale x 16 x i8> [[ZM:%.*]], i32 15)
+// CHECK-NEXT: tail call void @llvm.aarch64.sme.umls.za32.lane.vg4x1.nxv16i8(i32 [[SLICE_BASE:%.*]], <vscale x 16 x i8> [[ZN:%.*]], <vscale x 16 x i8> [[ZM:%.*]], i32 15)
// CHECK-NEXT: ret void
//
// CPP-CHECK-LABEL: @_Z21test_smlsl_lane_x1_u8ju11__SVUint8_tS_(
// CPP-CHECK-NEXT: entry:
-// CPP-CHECK-NEXT: [[ADD:%.*]] = add i32 [[SLICE_BASE:%.*]], 12
-// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.umls.za32.lane.vg4x1.nxv16i8(i32 [[ADD]], <vscale x 16 x i8> [[ZN:%.*]], <vscale x 16 x i8> [[ZM:%.*]], i32 15)
+// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.umls.za32.lane.vg4x1.nxv16i8(i32 [[SLICE_BASE:%.*]], <vscale x 16 x i8> [[ZN:%.*]], <vscale x 16 x i8> [[ZM:%.*]], i32 15)
// CPP-CHECK-NEXT: ret void
//
void test_smlsl_lane_x1_u8(uint32_t slice_base, svuint8_t zn, svuint8_t zm) __arm_streaming __arm_shared_za
{
- SVE_ACLE_FUNC(svmls_lane_za32,_u8,_vg4x1,,)(slice_base + 12, zn, zm, 15);
+ SVE_ACLE_FUNC(svmls_lane_za32,_u8,_vg4x1,,)(slice_base, zn, zm, 15);
}
// CHECK-LABEL: @test_smlsl_lane_x1_u16(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[ADD:%.*]] = add i32 [[SLICE_BASE:%.*]], 12
-// CHECK-NEXT: tail call void @llvm.aarch64.sme.umls.za64.lane.vg4x1.nxv8i16(i32 [[ADD]], <vscale x 8 x i16> [[ZN:%.*]], <vscale x 8 x i16> [[ZM:%.*]], i32 7)
+// CHECK-NEXT: tail call void @llvm.aarch64.sme.umls.za64.lane.vg4x1.nxv8i16(i32 [[SLICE_BASE:%.*]], <vscale x 8 x i16> [[ZN:%.*]], <vscale x 8 x i16> [[ZM:%.*]], i32 7)
// CHECK-NEXT: ret void
//
// CPP-CHECK-LABEL: @_Z22test_smlsl_lane_x1_u16ju12__SVUint16_tS_(
// CPP-CHECK-NEXT: entry:
-// CPP-CHECK-NEXT: [[ADD:%.*]] = add i32 [[SLICE_BASE:%.*]], 12
-// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.umls.za64.lane.vg4x1.nxv8i16(i32 [[ADD]], <vscale x 8 x i16> [[ZN:%.*]], <vscale x 8 x i16> [[ZM:%.*]], i32 7)
+// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.umls.za64.lane.vg4x1.nxv8i16(i32 [[SLICE_BASE:%.*]], <vscale x 8 x i16> [[ZN:%.*]], <vscale x 8 x i16> [[ZM:%.*]], i32 7)
// CPP-CHECK-NEXT: ret void
//
void test_smlsl_lane_x1_u16(uint32_t slice_base, svuint16_t zn, svuint16_t zm) __arm_streaming __arm_shared_za
{
- SVE_ACLE_FUNC(svmls_lane_za64,_u16,_vg4x1,,)(slice_base + 12, zn, zm, 7);
+ SVE_ACLE_FUNC(svmls_lane_za64,_u16,_vg4x1,,)(slice_base, zn, zm, 7);
}
// SUMLALL
// CHECK-LABEL: @test_sumlall_lane_x1_s8(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[ADD:%.*]] = add i32 [[SLICE_BASE:%.*]], 12
-// CHECK-NEXT: tail call void @llvm.aarch64.sme.sumla.za32.lane.vg4x1.nxv16i8(i32 [[ADD]], <vscale x 16 x i8> [[ZN:%.*]], <vscale x 16 x i8> [[ZM:%.*]], i32 15)
+// CHECK-NEXT: tail call void @llvm.aarch64.sme.sumla.za32.lane.vg4x1.nxv16i8(i32 [[SLICE_BASE:%.*]], <vscale x 16 x i8> [[ZN:%.*]], <vscale x 16 x i8> [[ZM:%.*]], i32 15)
// CHECK-NEXT: ret void
//
// CPP-CHECK-LABEL: @_Z23test_sumlall_lane_x1_s8ju10__SVInt8_tu11__SVUint8_t(
// CPP-CHECK-NEXT: entry:
-// CPP-CHECK-NEXT: [[ADD:%.*]] = add i32 [[SLICE_BASE:%.*]], 12
-// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.sumla.za32.lane.vg4x1.nxv16i8(i32 [[ADD]], <vscale x 16 x i8> [[ZN:%.*]], <vscale x 16 x i8> [[ZM:%.*]], i32 15)
+// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.sumla.za32.lane.vg4x1.nxv16i8(i32 [[SLICE_BASE:%.*]], <vscale x 16 x i8> [[ZN:%.*]], <vscale x 16 x i8> [[ZM:%.*]], i32 15)
// CPP-CHECK-NEXT: ret void
//
void test_sumlall_lane_x1_s8(uint32_t slice_base, svint8_t zn, svuint8_t zm) __arm_streaming __arm_shared_za
{
- SVE_ACLE_FUNC(svsumla_lane_za32,_s8,_vg4x1,,)(slice_base + 12, zn, zm, 15);
+ SVE_ACLE_FUNC(svsumla_lane_za32,_s8,_vg4x1,,)(slice_base, zn, zm, 15);
}
// USMLALL
// CHECK-LABEL: @test_usmlall_lane_x1_u8(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[ADD:%.*]] = add i32 [[SLICE_BASE:%.*]], 12
-// CHECK-NEXT: tail call void @llvm.aarch64.sme.usmla.za32.lane.vg4x1.nxv16i8(i32 [[ADD]], <vscale x 16 x i8> [[ZN:%.*]], <vscale x 16 x i8> [[ZM:%.*]], i32 15)
+// CHECK-NEXT: tail call void @llvm.aarch64.sme.usmla.za32.lane.vg4x1.nxv16i8(i32 [[SLICE_BASE:%.*]], <vscale x 16 x i8> [[ZN:%.*]], <vscale x 16 x i8> [[ZM:%.*]], i32 15)
// CHECK-NEXT: ret void
//
// CPP-CHECK-LABEL: @_Z23test_usmlall_lane_x1_u8ju11__SVUint8_tu10__SVInt8_t(
// CPP-CHECK-NEXT: entry:
-// CPP-CHECK-NEXT: [[ADD:%.*]] = add i32 [[SLICE_BASE:%.*]], 12
-// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.usmla.za32.lane.vg4x1.nxv16i8(i32 [[ADD]], <vscale x 16 x i8> [[ZN:%.*]], <vscale x 16 x i8> [[ZM:%.*]], i32 15)
+// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.usmla.za32.lane.vg4x1.nxv16i8(i32 [[SLICE_BASE:%.*]], <vscale x 16 x i8> [[ZN:%.*]], <vscale x 16 x i8> [[ZM:%.*]], i32 15)
// CPP-CHECK-NEXT: ret void
//
void test_usmlall_lane_x1_u8(uint32_t slice_base, svuint8_t zn, svint8_t zm) __arm_streaming __arm_shared_za
{
- SVE_ACLE_FUNC(svusmla_lane_za32,_u8,_vg4x1,,)(slice_base + 12, zn, zm, 15);
+ SVE_ACLE_FUNC(svusmla_lane_za32,_u8,_vg4x1,,)(slice_base, zn, zm, 15);
}
//
@@ -1475,216 +1355,196 @@ void test_usmlall_lane_x1_u8(uint32_t slice_base, svuint8_t zn, svint8_t zm) __a
// CHECK-LABEL: @test_smlal_lane_x2_s8(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[ADD:%.*]] = add i32 [[SLICE_BASE:%.*]], 4
// CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv32i8(<vscale x 32 x i8> [[ZN:%.*]], i64 0)
// CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv32i8(<vscale x 32 x i8> [[ZN]], i64 16)
-// CHECK-NEXT: tail call void @llvm.aarch64.sme.smla.za32.lane.vg4x2.nxv16i8(i32 [[ADD]], <vscale x 16 x i8> [[TMP0]], <vscale x 16 x i8> [[TMP1]], <vscale x 16 x i8> [[ZM:%.*]], i32 15)
+// CHECK-NEXT: tail call void @llvm.aarch64.sme.smla.za32.lane.vg4x2.nxv16i8(i32 [[SLICE_BASE:%.*]], <vscale x 16 x i8> [[TMP0]], <vscale x 16 x i8> [[TMP1]], <vscale x 16 x i8> [[ZM:%.*]], i32 15)
// CHECK-NEXT: ret void
//
// CPP-CHECK-LABEL: @_Z21test_smlal_lane_x2_s8j10svint8x2_tu10__SVInt8_t(
// CPP-CHECK-NEXT: entry:
-// CPP-CHECK-NEXT: [[ADD:%.*]] = add i32 [[SLICE_BASE:%.*]], 4
// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv32i8(<vscale x 32 x i8> [[ZN:%.*]], i64 0)
// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv32i8(<vscale x 32 x i8> [[ZN]], i64 16)
-// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.smla.za32.lane.vg4x2.nxv16i8(i32 [[ADD]], <vscale x 16 x i8> [[TMP0]], <vscale x 16 x i8> [[TMP1]], <vscale x 16 x i8> [[ZM:%.*]], i32 15)
+// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.smla.za32.lane.vg4x2.nxv16i8(i32 [[SLICE_BASE:%.*]], <vscale x 16 x i8> [[TMP0]], <vscale x 16 x i8> [[TMP1]], <vscale x 16 x i8> [[ZM:%.*]], i32 15)
// CPP-CHECK-NEXT: ret void
//
void test_smlal_lane_x2_s8(uint32_t slice_base, svint8x2_t zn, svint8_t zm) __arm_streaming __arm_shared_za
{
- SVE_ACLE_FUNC(svmla_lane_za32,_s8,_vg4x2,,)(slice_base + 4, zn, zm, 15);
+ SVE_ACLE_FUNC(svmla_lane_za32,_s8,_vg4x2,,)(slice_base, zn, zm, 15);
}
// CHECK-LABEL: @test_smlal_lane_x2_s16(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[ADD:%.*]] = add i32 [[SLICE_BASE:%.*]], 4
// CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv16i16(<vscale x 16 x i16> [[ZN:%.*]], i64 0)
// CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv16i16(<vscale x 16 x i16> [[ZN]], i64 8)
-// CHECK-NEXT: tail call void @llvm.aarch64.sme.smla.za64.lane.vg4x2.nxv8i16(i32 [[ADD]], <vscale x 8 x i16> [[TMP0]], <vscale x 8 x i16> [[TMP1]], <vscale x 8 x i16> [[ZM:%.*]], i32 7)
+// CHECK-NEXT: tail call void @llvm.aarch64.sme.smla.za64.lane.vg4x2.nxv8i16(i32 [[SLICE_BASE:%.*]], <vscale x 8 x i16> [[TMP0]], <vscale x 8 x i16> [[TMP1]], <vscale x 8 x i16> [[ZM:%.*]], i32 7)
// CHECK-NEXT: ret void
//
// CPP-CHECK-LABEL: @_Z22test_smlal_lane_x2_s16j11svint16x2_tu11__SVInt16_t(
// CPP-CHECK-NEXT: entry:
-// CPP-CHECK-NEXT: [[ADD:%.*]] = add i32 [[SLICE_BASE:%.*]], 4
// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv16i16(<vscale x 16 x i16> [[ZN:%.*]], i64 0)
// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv16i16(<vscale x 16 x i16> [[ZN]], i64 8)
-// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.smla.za64.lane.vg4x2.nxv8i16(i32 [[ADD]], <vscale x 8 x i16> [[TMP0]], <vscale x 8 x i16> [[TMP1]], <vscale x 8 x i16> [[ZM:%.*]], i32 7)
+// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.smla.za64.lane.vg4x2.nxv8i16(i32 [[SLICE_BASE:%.*]], <vscale x 8 x i16> [[TMP0]], <vscale x 8 x i16> [[TMP1]], <vscale x 8 x i16> [[ZM:%.*]], i32 7)
// CPP-CHECK-NEXT: ret void
//
void test_smlal_lane_x2_s16(uint32_t slice_base, svint16x2_t zn, svint16_t zm) __arm_streaming __arm_shared_za
{
- SVE_ACLE_FUNC(svmla_lane_za64,_s16,_vg4x2,,)(slice_base + 4, zn, zm, 7);
+ SVE_ACLE_FUNC(svmla_lane_za64,_s16,_vg4x2,,)(slice_base, zn, zm, 7);
}
// CHECK-LABEL: @test_smlal_lane_x2_u8(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[ADD:%.*]] = add i32 [[SLICE_BASE:%.*]], 4
// CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv32i8(<vscale x 32 x i8> [[ZN:%.*]], i64 0)
// CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv32i8(<vscale x 32 x i8> [[ZN]], i64 16)
-// CHECK-NEXT: tail call void @llvm.aarch64.sme.umla.za32.lane.vg4x2.nxv16i8(i32 [[ADD]], <vscale x 16 x i8> [[TMP0]], <vscale x 16 x i8> [[TMP1]], <vscale x 16 x i8> [[ZM:%.*]], i32 15)
+// CHECK-NEXT: tail call void @llvm.aarch64.sme.umla.za32.lane.vg4x2.nxv16i8(i32 [[SLICE_BASE:%.*]], <vscale x 16 x i8> [[TMP0]], <vscale x 16 x i8> [[TMP1]], <vscale x 16 x i8> [[ZM:%.*]], i32 15)
// CHECK-NEXT: ret void
//
// CPP-CHECK-LABEL: @_Z21test_smlal_lane_x2_u8j11svuint8x2_tu11__SVUint8_t(
// CPP-CHECK-NEXT: entry:
-// CPP-CHECK-NEXT: [[ADD:%.*]] = add i32 [[SLICE_BASE:%.*]], 4
// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv32i8(<vscale x 32 x i8> [[ZN:%.*]], i64 0)
// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv32i8(<vscale x 32 x i8> [[ZN]], i64 16)
-// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.umla.za32.lane.vg4x2.nxv16i8(i32 [[ADD]], <vscale x 16 x i8> [[TMP0]], <vscale x 16 x i8> [[TMP1]], <vscale x 16 x i8> [[ZM:%.*]], i32 15)
+// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.umla.za32.lane.vg4x2.nxv16i8(i32 [[SLICE_BASE:%.*]], <vscale x 16 x i8> [[TMP0]], <vscale x 16 x i8> [[TMP1]], <vscale x 16 x i8> [[ZM:%.*]], i32 15)
// CPP-CHECK-NEXT: ret void
//
void test_smlal_lane_x2_u8(uint32_t slice_base, svuint8x2_t zn, svuint8_t zm) __arm_streaming __arm_shared_za
{
- SVE_ACLE_FUNC(svmla_lane_za32,_u8,_vg4x2,,)(slice_base + 4, zn, zm, 15);
+ SVE_ACLE_FUNC(svmla_lane_za32,_u8,_vg4x2,,)(slice_base, zn, zm, 15);
}
// CHECK-LABEL: @test_smlal_lane_x2_u16(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[ADD:%.*]] = add i32 [[SLICE_BASE:%.*]], 4
// CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv16i16(<vscale x 16 x i16> [[ZN:%.*]], i64 0)
// CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv16i16(<vscale x 16 x i16> [[ZN]], i64 8)
-// CHECK-NEXT: tail call void @llvm.aarch64.sme.umla.za64.lane.vg4x2.nxv8i16(i32 [[ADD]], <vscale x 8 x i16> [[TMP0]], <vscale x 8 x i16> [[TMP1]], <vscale x 8 x i16> [[ZM:%.*]], i32 7)
+// CHECK-NEXT: tail call void @llvm.aarch64.sme.umla.za64.lane.vg4x2.nxv8i16(i32 [[SLICE_BASE:%.*]], <vscale x 8 x i16> [[TMP0]], <vscale x 8 x i16> [[TMP1]], <vscale x 8 x i16> [[ZM:%.*]], i32 7)
// CHECK-NEXT: ret void
//
// CPP-CHECK-LABEL: @_Z22test_smlal_lane_x2_u16j12svuint16x2_tu12__SVUint16_t(
// CPP-CHECK-NEXT: entry:
-// CPP-CHECK-NEXT: [[ADD:%.*]] = add i32 [[SLICE_BASE:%.*]], 4
// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv16i16(<vscale x 16 x i16> [[ZN:%.*]], i64 0)
// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv16i16(<vscale x 16 x i16> [[ZN]], i64 8)
-// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.umla.za64.lane.vg4x2.nxv8i16(i32 [[ADD]], <vscale x 8 x i16> [[TMP0]], <vscale x 8 x i16> [[TMP1]], <vscale x 8 x i16> [[ZM:%.*]], i32 7)
+// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.umla.za64.lane.vg4x2.nxv8i16(i32 [[SLICE_BASE:%.*]], <vscale x 8 x i16> [[TMP0]], <vscale x 8 x i16> [[TMP1]], <vscale x 8 x i16> [[ZM:%.*]], i32 7)
// CPP-CHECK-NEXT: ret void
//
void test_smlal_lane_x2_u16(uint32_t slice_base, svuint16x2_t zn, svuint16_t zm) __arm_streaming __arm_shared_za
{
- SVE_ACLE_FUNC(svmla_lane_za64,_u16,_vg4x2,,)(slice_base + 4, zn, zm, 7);
+ SVE_ACLE_FUNC(svmla_lane_za64,_u16,_vg4x2,,)(slice_base, zn, zm, 7);
}
// SMLSL
// CHECK-LABEL: @test_smlsl_lane_x2_s8(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[ADD:%.*]] = add i32 [[SLICE_BASE:%.*]], 4
// CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv32i8(<vscale x 32 x i8> [[ZN:%.*]], i64 0)
// CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv32i8(<vscale x 32 x i8> [[ZN]], i64 16)
-// CHECK-NEXT: tail call void @llvm.aarch64.sme.smls.za32.lane.vg4x2.nxv16i8(i32 [[ADD]], <vscale x 16 x i8> [[TMP0]], <vscale x 16 x i8> [[TMP1]], <vscale x 16 x i8> [[ZM:%.*]], i32 15)
+// CHECK-NEXT: tail call void @llvm.aarch64.sme.smls.za32.lane.vg4x2.nxv16i8(i32 [[SLICE_BASE:%.*]], <vscale x 16 x i8> [[TMP0]], <vscale x 16 x i8> [[TMP1]], <vscale x 16 x i8> [[ZM:%.*]], i32 15)
// CHECK-NEXT: ret void
//
// CPP-CHECK-LABEL: @_Z21test_smlsl_lane_x2_s8j10svint8x2_tu10__SVInt8_t(
// CPP-CHECK-NEXT: entry:
-// CPP-CHECK-NEXT: [[ADD:%.*]] = add i32 [[SLICE_BASE:%.*]], 4
// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv32i8(<vscale x 32 x i8> [[ZN:%.*]], i64 0)
// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv32i8(<vscale x 32 x i8> [[ZN]], i64 16)
-// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.smls.za32.lane.vg4x2.nxv16i8(i32 [[ADD]], <vscale x 16 x i8> [[TMP0]], <vscale x 16 x i8> [[TMP1]], <vscale x 16 x i8> [[ZM:%.*]], i32 15)
+// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.smls.za32.lane.vg4x2.nxv16i8(i32 [[SLICE_BASE:%.*]], <vscale x 16 x i8> [[TMP0]], <vscale x 16 x i8> [[TMP1]], <vscale x 16 x i8> [[ZM:%.*]], i32 15)
// CPP-CHECK-NEXT: ret void
//
void test_smlsl_lane_x2_s8(uint32_t slice_base, svint8x2_t zn, svint8_t zm) __arm_streaming __arm_shared_za
{
- SVE_ACLE_FUNC(svmls_lane_za32,_s8,_vg4x2,,)(slice_base + 4, zn, zm, 15);
+ SVE_ACLE_FUNC(svmls_lane_za32,_s8,_vg4x2,,)(slice_base, zn, zm, 15);
}
// CHECK-LABEL: @test_smlsl_lane_x2_s16(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[ADD:%.*]] = add i32 [[SLICE_BASE:%.*]], 4
// CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv16i16(<vscale x 16 x i16> [[ZN:%.*]], i64 0)
// CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv16i16(<vscale x 16 x i16> [[ZN]], i64 8)
-// CHECK-NEXT: tail call void @llvm.aarch64.sme.smls.za64.lane.vg4x2.nxv8i16(i32 [[ADD]], <vscale x 8 x i16> [[TMP0]], <vscale x 8 x i16> [[TMP1]], <vscale x 8 x i16> [[ZM:%.*]], i32 7)
+// CHECK-NEXT: tail call void @llvm.aarch64.sme.smls.za64.lane.vg4x2.nxv8i16(i32 [[SLICE_BASE:%.*]], <vscale x 8 x i16> [[TMP0]], <vscale x 8 x i16> [[TMP1]], <vscale x 8 x i16> [[ZM:%.*]], i32 7)
// CHECK-NEXT: ret void
//
// CPP-CHECK-LABEL: @_Z22test_smlsl_lane_x2_s16j11svint16x2_tu11__SVInt16_t(
// CPP-CHECK-NEXT: entry:
-// CPP-CHECK-NEXT: [[ADD:%.*]] = add i32 [[SLICE_BASE:%.*]], 4
// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv16i16(<vscale x 16 x i16> [[ZN:%.*]], i64 0)
// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv16i16(<vscale x 16 x i16> [[ZN]], i64 8)
-// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.smls.za64.lane.vg4x2.nxv8i16(i32 [[ADD]], <vscale x 8 x i16> [[TMP0]], <vscale x 8 x i16> [[TMP1]], <vscale x 8 x i16> [[ZM:%.*]], i32 7)
+// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.smls.za64.lane.vg4x2.nxv8i16(i32 [[SLICE_BASE:%.*]], <vscale x 8 x i16> [[TMP0]], <vscale x 8 x i16> [[TMP1]], <vscale x 8 x i16> [[ZM:%.*]], i32 7)
// CPP-CHECK-NEXT: ret void
//
void test_smlsl_lane_x2_s16(uint32_t slice_base, svint16x2_t zn, svint16_t zm) __arm_streaming __arm_shared_za
{
- SVE_ACLE_FUNC(svmls_lane_za64,_s16,_vg4x2,,)(slice_base + 4, zn, zm, 7);
+ SVE_ACLE_FUNC(svmls_lane_za64,_s16,_vg4x2,,)(slice_base, zn, zm, 7);
}
// CHECK-LABEL: @test_smlsl_lane_x2_u8(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[ADD:%.*]] = add i32 [[SLICE_BASE:%.*]], 4
// CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv32i8(<vscale x 32 x i8> [[ZN:%.*]], i64 0)
// CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv32i8(<vscale x 32 x i8> [[ZN]], i64 16)
-// CHECK-NEXT: tail call void @llvm.aarch64.sme.umls.za32.lane.vg4x2.nxv16i8(i32 [[ADD]], <vscale x 16 x i8> [[TMP0]], <vscale x 16 x i8> [[TMP1]], <vscale x 16 x i8> [[ZM:%.*]], i32 15)
+// CHECK-NEXT: tail call void @llvm.aarch64.sme.umls.za32.lane.vg4x2.nxv16i8(i32 [[SLICE_BASE:%.*]], <vscale x 16 x i8> [[TMP0]], <vscale x 16 x i8> [[TMP1]], <vscale x 16 x i8> [[ZM:%.*]], i32 15)
// CHECK-NEXT: ret void
//
// CPP-CHECK-LABEL: @_Z21test_smlsl_lane_x2_u8j11svuint8x2_tu11__SVUint8_t(
// CPP-CHECK-NEXT: entry:
-// CPP-CHECK-NEXT: [[ADD:%.*]] = add i32 [[SLICE_BASE:%.*]], 4
// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv32i8(<vscale x 32 x i8> [[ZN:%.*]], i64 0)
// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv32i8(<vscale x 32 x i8> [[ZN]], i64 16)
-// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.umls.za32.lane.vg4x2.nxv16i8(i32 [[ADD]], <vscale x 16 x i8> [[TMP0]], <vscale x 16 x i8> [[TMP1]], <vscale x 16 x i8> [[ZM:%.*]], i32 15)
+// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.umls.za32.lane.vg4x2.nxv16i8(i32 [[SLICE_BASE:%.*]], <vscale x 16 x i8> [[TMP0]], <vscale x 16 x i8> [[TMP1]], <vscale x 16 x i8> [[ZM:%.*]], i32 15)
// CPP-CHECK-NEXT: ret void
//
void test_smlsl_lane_x2_u8(uint32_t slice_base, svuint8x2_t zn, svuint8_t zm) __arm_streaming __arm_shared_za
{
- SVE_ACLE_FUNC(svmls_lane_za32,_u8,_vg4x2,,)(slice_base + 4, zn, zm, 15);
+ SVE_ACLE_FUNC(svmls_lane_za32,_u8,_vg4x2,,)(slice_base, zn, zm, 15);
}
// CHECK-LABEL: @test_smlsl_lane_x2_u16(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[ADD:%.*]] = add i32 [[SLICE_BASE:%.*]], 4
// CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv16i16(<vscale x 16 x i16> [[ZN:%.*]], i64 0)
// CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv16i16(<vscale x 16 x i16> [[ZN]], i64 8)
-// CHECK-NEXT: tail call void @llvm.aarch64.sme.umls.za64.lane.vg4x2.nxv8i16(i32 [[ADD]], <vscale x 8 x i16> [[TMP0]], <vscale x 8 x i16> [[TMP1]], <vscale x 8 x i16> [[ZM:%.*]], i32 7)
+// CHECK-NEXT: tail call void @llvm.aarch64.sme.umls.za64.lane.vg4x2.nxv8i16(i32 [[SLICE_BASE:%.*]], <vscale x 8 x i16> [[TMP0]], <vscale x 8 x i16> [[TMP1]], <vscale x 8 x i16> [[ZM:%.*]], i32 7)
// CHECK-NEXT: ret void
//
// CPP-CHECK-LABEL: @_Z22test_smlsl_lane_x2_u16j12svuint16x2_tu12__SVUint16_t(
// CPP-CHECK-NEXT: entry:
-// CPP-CHECK-NEXT: [[ADD:%.*]] = add i32 [[SLICE_BASE:%.*]], 4
// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv16i16(<vscale x 16 x i16> [[ZN:%.*]], i64 0)
// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv16i16(<vscale x 16 x i16> [[ZN]], i64 8)
-// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.umls.za64.lane.vg4x2.nxv8i16(i32 [[ADD]], <vscale x 8 x i16> [[TMP0]], <vscale x 8 x i16> [[TMP1]], <vscale x 8 x i16> [[ZM:%.*]], i32 7)
+// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.umls.za64.lane.vg4x2.nxv8i16(i32 [[SLICE_BASE:%.*]], <vscale x 8 x i16> [[TMP0]], <vscale x 8 x i16> [[TMP1]], <vscale x 8 x i16> [[ZM:%.*]], i32 7)
// CPP-CHECK-NEXT: ret void
//
void test_smlsl_lane_x2_u16(uint32_t slice_base, svuint16x2_t zn, svuint16_t zm) __arm_streaming __arm_shared_za
{
- SVE_ACLE_FUNC(svmls_lane_za64,_u16,_vg4x2,,)(slice_base + 4, zn, zm, 7);
+ SVE_ACLE_FUNC(svmls_lane_za64,_u16,_vg4x2,,)(slice_base, zn, zm, 7);
}
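// A minimal sketch of the caller shape for the lane-indexed builtins above,
// assuming <arm_sme.h> is included as in this test file; the helper name and
// the chosen lane are illustrative. After this change the ZA slice base is
// passed straight through, and the trailing argument is an immediate lane
// index: 0..15 for 8-bit elements, 0..7 for 16-bit elements.
void example_mlsl_lane_x2(uint32_t slice_base, svuint8x2_t zn,
                          svuint8_t zm) __arm_streaming __arm_shared_za
{
  // Subtract the widened zn * zm[lane 3] products from ZA32 at slice_base.
  svmls_lane_za32_u8_vg4x2(slice_base, zn, zm, 3);
}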
// SUMLALL
// CHECK-LABEL: @test_sumlall_lane_x2_s8(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[ADD:%.*]] = add i32 [[SLICE_BASE:%.*]], 4
// CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv32i8(<vscale x 32 x i8> [[ZN:%.*]], i64 0)
// CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv32i8(<vscale x 32 x i8> [[ZN]], i64 16)
-// CHECK-NEXT: tail call void @llvm.aarch64.sme.sumla.za32.lane.vg4x2.nxv16i8(i32 [[ADD]], <vscale x 16 x i8> [[TMP0]], <vscale x 16 x i8> [[TMP1]], <vscale x 16 x i8> [[ZM:%.*]], i32 15)
+// CHECK-NEXT: tail call void @llvm.aarch64.sme.sumla.za32.lane.vg4x2.nxv16i8(i32 [[SLICE_BASE:%.*]], <vscale x 16 x i8> [[TMP0]], <vscale x 16 x i8> [[TMP1]], <vscale x 16 x i8> [[ZM:%.*]], i32 15)
// CHECK-NEXT: ret void
//
// CPP-CHECK-LABEL: @_Z23test_sumlall_lane_x2_s8j10svint8x2_tu11__SVUint8_t(
// CPP-CHECK-NEXT: entry:
-// CPP-CHECK-NEXT: [[ADD:%.*]] = add i32 [[SLICE_BASE:%.*]], 4
// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv32i8(<vscale x 32 x i8> [[ZN:%.*]], i64 0)
// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv32i8(<vscale x 32 x i8> [[ZN]], i64 16)
-// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.sumla.za32.lane.vg4x2.nxv16i8(i32 [[ADD]], <vscale x 16 x i8> [[TMP0]], <vscale x 16 x i8> [[TMP1]], <vscale x 16 x i8> [[ZM:%.*]], i32 15)
+// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.sumla.za32.lane.vg4x2.nxv16i8(i32 [[SLICE_BASE:%.*]], <vscale x 16 x i8> [[TMP0]], <vscale x 16 x i8> [[TMP1]], <vscale x 16 x i8> [[ZM:%.*]], i32 15)
// CPP-CHECK-NEXT: ret void
//
void test_sumlall_lane_x2_s8(uint32_t slice_base, svint8x2_t zn, svuint8_t zm) __arm_streaming __arm_shared_za
{
- SVE_ACLE_FUNC(svsumla_lane_za32,_s8,_vg4x2,,)(slice_base + 4, zn, zm, 15);
+ SVE_ACLE_FUNC(svsumla_lane_za32,_s8,_vg4x2,,)(slice_base, zn, zm, 15);
}
// CHECK-LABEL: @test_usmlall_lane_x2_u8(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[ADD:%.*]] = add i32 [[SLICE_BASE:%.*]], 4
// CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv32i8(<vscale x 32 x i8> [[ZN:%.*]], i64 0)
// CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv32i8(<vscale x 32 x i8> [[ZN]], i64 16)
-// CHECK-NEXT: tail call void @llvm.aarch64.sme.usmla.za32.lane.vg4x2.nxv16i8(i32 [[ADD]], <vscale x 16 x i8> [[TMP0]], <vscale x 16 x i8> [[TMP1]], <vscale x 16 x i8> [[ZM:%.*]], i32 15)
+// CHECK-NEXT: tail call void @llvm.aarch64.sme.usmla.za32.lane.vg4x2.nxv16i8(i32 [[SLICE_BASE:%.*]], <vscale x 16 x i8> [[TMP0]], <vscale x 16 x i8> [[TMP1]], <vscale x 16 x i8> [[ZM:%.*]], i32 15)
// CHECK-NEXT: ret void
//
// CPP-CHECK-LABEL: @_Z23test_usmlall_lane_x2_u8j11svuint8x2_tu10__SVInt8_t(
// CPP-CHECK-NEXT: entry:
-// CPP-CHECK-NEXT: [[ADD:%.*]] = add i32 [[SLICE_BASE:%.*]], 4
// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv32i8(<vscale x 32 x i8> [[ZN:%.*]], i64 0)
// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv32i8(<vscale x 32 x i8> [[ZN]], i64 16)
-// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.usmla.za32.lane.vg4x2.nxv16i8(i32 [[ADD]], <vscale x 16 x i8> [[TMP0]], <vscale x 16 x i8> [[TMP1]], <vscale x 16 x i8> [[ZM:%.*]], i32 15)
+// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.usmla.za32.lane.vg4x2.nxv16i8(i32 [[SLICE_BASE:%.*]], <vscale x 16 x i8> [[TMP0]], <vscale x 16 x i8> [[TMP1]], <vscale x 16 x i8> [[ZM:%.*]], i32 15)
// CPP-CHECK-NEXT: ret void
//
void test_usmlall_lane_x2_u8(uint32_t slice_base, svuint8x2_t zn, svint8_t zm) __arm_streaming __arm_shared_za
{
- SVE_ACLE_FUNC(svusmla_lane_za32,_u8,_vg4x2,,)(slice_base + 4, zn, zm, 15);
+ SVE_ACLE_FUNC(svusmla_lane_za32,_u8,_vg4x2,,)(slice_base, zn, zm, 15);
}
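// A minimal sketch of the mixed-sign forms, under the same assumptions as
// above; names are illustrative. svsumla_* pairs signed multi-vectors with
// an unsigned indexed vector, and svusmla_* is the reverse, matching the
// operand types of the two tests above.
void example_mixed_sign_x2(uint32_t slice_base, svint8x2_t sn,
                           svuint8x2_t un, svuint8_t uz,
                           svint8_t sz) __arm_streaming __arm_shared_za
{
  svsumla_lane_za32_s8_vg4x2(slice_base, sn, uz, 0); // signed zn, unsigned zm
  svusmla_lane_za32_u8_vg4x2(slice_base, un, sz, 0); // unsigned zn, signed zm
}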
//
@@ -1695,256 +1555,236 @@ void test_usmlall_lane_x2_u8(uint32_t slice_base, svuint8x2_t zn, svint8_t zm) _
// CHECK-LABEL: @test_smlal_lane_x4_s8(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[ADD:%.*]] = add i32 [[SLICE_BASE:%.*]], 4
// CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv64i8(<vscale x 64 x i8> [[ZN:%.*]], i64 0)
// CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv64i8(<vscale x 64 x i8> [[ZN]], i64 16)
// CHECK-NEXT: [[TMP2:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv64i8(<vscale x 64 x i8> [[ZN]], i64 32)
// CHECK-NEXT: [[TMP3:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv64i8(<vscale x 64 x i8> [[ZN]], i64 48)
-// CHECK-NEXT: tail call void @llvm.aarch64.sme.smla.za32.lane.vg4x4.nxv16i8(i32 [[ADD]], <vscale x 16 x i8> [[TMP0]], <vscale x 16 x i8> [[TMP1]], <vscale x 16 x i8> [[TMP2]], <vscale x 16 x i8> [[TMP3]], <vscale x 16 x i8> [[ZM:%.*]], i32 15)
+// CHECK-NEXT: tail call void @llvm.aarch64.sme.smla.za32.lane.vg4x4.nxv16i8(i32 [[SLICE_BASE:%.*]], <vscale x 16 x i8> [[TMP0]], <vscale x 16 x i8> [[TMP1]], <vscale x 16 x i8> [[TMP2]], <vscale x 16 x i8> [[TMP3]], <vscale x 16 x i8> [[ZM:%.*]], i32 15)
// CHECK-NEXT: ret void
//
// CPP-CHECK-LABEL: @_Z21test_smlal_lane_x4_s8j10svint8x4_tu10__SVInt8_t(
// CPP-CHECK-NEXT: entry:
-// CPP-CHECK-NEXT: [[ADD:%.*]] = add i32 [[SLICE_BASE:%.*]], 4
// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv64i8(<vscale x 64 x i8> [[ZN:%.*]], i64 0)
// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv64i8(<vscale x 64 x i8> [[ZN]], i64 16)
// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv64i8(<vscale x 64 x i8> [[ZN]], i64 32)
// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv64i8(<vscale x 64 x i8> [[ZN]], i64 48)
-// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.smla.za32.lane.vg4x4.nxv16i8(i32 [[ADD]], <vscale x 16 x i8> [[TMP0]], <vscale x 16 x i8> [[TMP1]], <vscale x 16 x i8> [[TMP2]], <vscale x 16 x i8> [[TMP3]], <vscale x 16 x i8> [[ZM:%.*]], i32 15)
+// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.smla.za32.lane.vg4x4.nxv16i8(i32 [[SLICE_BASE:%.*]], <vscale x 16 x i8> [[TMP0]], <vscale x 16 x i8> [[TMP1]], <vscale x 16 x i8> [[TMP2]], <vscale x 16 x i8> [[TMP3]], <vscale x 16 x i8> [[ZM:%.*]], i32 15)
// CPP-CHECK-NEXT: ret void
//
void test_smlal_lane_x4_s8(uint32_t slice_base, svint8x4_t zn, svint8_t zm) __arm_streaming __arm_shared_za
{
- SVE_ACLE_FUNC(svmla_lane_za32,_s8,_vg4x4,,)(slice_base + 4, zn, zm, 15);
+ SVE_ACLE_FUNC(svmla_lane_za32,_s8,_vg4x4,,)(slice_base, zn, zm, 15);
}
// CHECK-LABEL: @test_smlal_lane_x4_s16(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[ADD:%.*]] = add i32 [[SLICE_BASE:%.*]], 4
// CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv32i16(<vscale x 32 x i16> [[ZN:%.*]], i64 0)
// CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv32i16(<vscale x 32 x i16> [[ZN]], i64 8)
// CHECK-NEXT: [[TMP2:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv32i16(<vscale x 32 x i16> [[ZN]], i64 16)
// CHECK-NEXT: [[TMP3:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv32i16(<vscale x 32 x i16> [[ZN]], i64 24)
-// CHECK-NEXT: tail call void @llvm.aarch64.sme.smla.za64.lane.vg4x4.nxv8i16(i32 [[ADD]], <vscale x 8 x i16> [[TMP0]], <vscale x 8 x i16> [[TMP1]], <vscale x 8 x i16> [[TMP2]], <vscale x 8 x i16> [[TMP3]], <vscale x 8 x i16> [[ZM:%.*]], i32 7)
+// CHECK-NEXT: tail call void @llvm.aarch64.sme.smla.za64.lane.vg4x4.nxv8i16(i32 [[SLICE_BASE:%.*]], <vscale x 8 x i16> [[TMP0]], <vscale x 8 x i16> [[TMP1]], <vscale x 8 x i16> [[TMP2]], <vscale x 8 x i16> [[TMP3]], <vscale x 8 x i16> [[ZM:%.*]], i32 7)
// CHECK-NEXT: ret void
//
// CPP-CHECK-LABEL: @_Z22test_smlal_lane_x4_s16j11svint16x4_tu11__SVInt16_t(
// CPP-CHECK-NEXT: entry:
-// CPP-CHECK-NEXT: [[ADD:%.*]] = add i32 [[SLICE_BASE:%.*]], 4
// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv32i16(<vscale x 32 x i16> [[ZN:%.*]], i64 0)
// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv32i16(<vscale x 32 x i16> [[ZN]], i64 8)
// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv32i16(<vscale x 32 x i16> [[ZN]], i64 16)
// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv32i16(<vscale x 32 x i16> [[ZN]], i64 24)
-// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.smla.za64.lane.vg4x4.nxv8i16(i32 [[ADD]], <vscale x 8 x i16> [[TMP0]], <vscale x 8 x i16> [[TMP1]], <vscale x 8 x i16> [[TMP2]], <vscale x 8 x i16> [[TMP3]], <vscale x 8 x i16> [[ZM:%.*]], i32 7)
+// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.smla.za64.lane.vg4x4.nxv8i16(i32 [[SLICE_BASE:%.*]], <vscale x 8 x i16> [[TMP0]], <vscale x 8 x i16> [[TMP1]], <vscale x 8 x i16> [[TMP2]], <vscale x 8 x i16> [[TMP3]], <vscale x 8 x i16> [[ZM:%.*]], i32 7)
// CPP-CHECK-NEXT: ret void
//
void test_smlal_lane_x4_s16(uint32_t slice_base, svint16x4_t zn, svint16_t zm) __arm_streaming __arm_shared_za
{
- SVE_ACLE_FUNC(svmla_lane_za64,_s16,_vg4x4,,)(slice_base + 4, zn, zm, 7);
+ SVE_ACLE_FUNC(svmla_lane_za64,_s16,_vg4x4,,)(slice_base, zn, zm, 7);
}
// CHECK-LABEL: @test_smlal_lane_x4_u8(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[ADD:%.*]] = add i32 [[SLICE_BASE:%.*]], 4
// CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv64i8(<vscale x 64 x i8> [[ZN:%.*]], i64 0)
// CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv64i8(<vscale x 64 x i8> [[ZN]], i64 16)
// CHECK-NEXT: [[TMP2:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv64i8(<vscale x 64 x i8> [[ZN]], i64 32)
// CHECK-NEXT: [[TMP3:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv64i8(<vscale x 64 x i8> [[ZN]], i64 48)
-// CHECK-NEXT: tail call void @llvm.aarch64.sme.umla.za32.lane.vg4x4.nxv16i8(i32 [[ADD]], <vscale x 16 x i8> [[TMP0]], <vscale x 16 x i8> [[TMP1]], <vscale x 16 x i8> [[TMP2]], <vscale x 16 x i8> [[TMP3]], <vscale x 16 x i8> [[ZM:%.*]], i32 15)
+// CHECK-NEXT: tail call void @llvm.aarch64.sme.umla.za32.lane.vg4x4.nxv16i8(i32 [[SLICE_BASE:%.*]], <vscale x 16 x i8> [[TMP0]], <vscale x 16 x i8> [[TMP1]], <vscale x 16 x i8> [[TMP2]], <vscale x 16 x i8> [[TMP3]], <vscale x 16 x i8> [[ZM:%.*]], i32 15)
// CHECK-NEXT: ret void
//
// CPP-CHECK-LABEL: @_Z21test_smlal_lane_x4_u8j11svuint8x4_tu11__SVUint8_t(
// CPP-CHECK-NEXT: entry:
-// CPP-CHECK-NEXT: [[ADD:%.*]] = add i32 [[SLICE_BASE:%.*]], 4
// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv64i8(<vscale x 64 x i8> [[ZN:%.*]], i64 0)
// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv64i8(<vscale x 64 x i8> [[ZN]], i64 16)
// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv64i8(<vscale x 64 x i8> [[ZN]], i64 32)
// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv64i8(<vscale x 64 x i8> [[ZN]], i64 48)
-// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.umla.za32.lane.vg4x4.nxv16i8(i32 [[ADD]], <vscale x 16 x i8> [[TMP0]], <vscale x 16 x i8> [[TMP1]], <vscale x 16 x i8> [[TMP2]], <vscale x 16 x i8> [[TMP3]], <vscale x 16 x i8> [[ZM:%.*]], i32 15)
+// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.umla.za32.lane.vg4x4.nxv16i8(i32 [[SLICE_BASE:%.*]], <vscale x 16 x i8> [[TMP0]], <vscale x 16 x i8> [[TMP1]], <vscale x 16 x i8> [[TMP2]], <vscale x 16 x i8> [[TMP3]], <vscale x 16 x i8> [[ZM:%.*]], i32 15)
// CPP-CHECK-NEXT: ret void
//
void test_smlal_lane_x4_u8(uint32_t slice_base, svuint8x4_t zn, svuint8_t zm) __arm_streaming __arm_shared_za
{
- SVE_ACLE_FUNC(svmla_lane_za32,_u8,_vg4x4,,)(slice_base + 4, zn, zm, 15);
+ SVE_ACLE_FUNC(svmla_lane_za32,_u8,_vg4x4,,)(slice_base, zn, zm, 15);
}
// CHECK-LABEL: @test_smlal_lane_x4_u16(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[ADD:%.*]] = add i32 [[SLICE_BASE:%.*]], 4
// CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv32i16(<vscale x 32 x i16> [[ZN:%.*]], i64 0)
// CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv32i16(<vscale x 32 x i16> [[ZN]], i64 8)
// CHECK-NEXT: [[TMP2:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv32i16(<vscale x 32 x i16> [[ZN]], i64 16)
// CHECK-NEXT: [[TMP3:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv32i16(<vscale x 32 x i16> [[ZN]], i64 24)
-// CHECK-NEXT: tail call void @llvm.aarch64.sme.umla.za64.lane.vg4x4.nxv8i16(i32 [[ADD]], <vscale x 8 x i16> [[TMP0]], <vscale x 8 x i16> [[TMP1]], <vscale x 8 x i16> [[TMP2]], <vscale x 8 x i16> [[TMP3]], <vscale x 8 x i16> [[ZM:%.*]], i32 7)
+// CHECK-NEXT: tail call void @llvm.aarch64.sme.umla.za64.lane.vg4x4.nxv8i16(i32 [[SLICE_BASE:%.*]], <vscale x 8 x i16> [[TMP0]], <vscale x 8 x i16> [[TMP1]], <vscale x 8 x i16> [[TMP2]], <vscale x 8 x i16> [[TMP3]], <vscale x 8 x i16> [[ZM:%.*]], i32 7)
// CHECK-NEXT: ret void
//
// CPP-CHECK-LABEL: @_Z22test_smlal_lane_x4_u16j12svuint16x4_tu12__SVUint16_t(
// CPP-CHECK-NEXT: entry:
-// CPP-CHECK-NEXT: [[ADD:%.*]] = add i32 [[SLICE_BASE:%.*]], 4
// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv32i16(<vscale x 32 x i16> [[ZN:%.*]], i64 0)
// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv32i16(<vscale x 32 x i16> [[ZN]], i64 8)
// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv32i16(<vscale x 32 x i16> [[ZN]], i64 16)
// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv32i16(<vscale x 32 x i16> [[ZN]], i64 24)
-// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.umla.za64.lane.vg4x4.nxv8i16(i32 [[ADD]], <vscale x 8 x i16> [[TMP0]], <vscale x 8 x i16> [[TMP1]], <vscale x 8 x i16> [[TMP2]], <vscale x 8 x i16> [[TMP3]], <vscale x 8 x i16> [[ZM:%.*]], i32 7)
+// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.umla.za64.lane.vg4x4.nxv8i16(i32 [[SLICE_BASE:%.*]], <vscale x 8 x i16> [[TMP0]], <vscale x 8 x i16> [[TMP1]], <vscale x 8 x i16> [[TMP2]], <vscale x 8 x i16> [[TMP3]], <vscale x 8 x i16> [[ZM:%.*]], i32 7)
// CPP-CHECK-NEXT: ret void
//
void test_smlal_lane_x4_u16(uint32_t slice_base, svuint16x4_t zn, svuint16_t zm) __arm_streaming __arm_shared_za
{
- SVE_ACLE_FUNC(svmla_lane_za64,_u16,_vg4x4,,)(slice_base + 4, zn, zm, 7);
+ SVE_ACLE_FUNC(svmla_lane_za64,_u16,_vg4x4,,)(slice_base, zn, zm, 7);
}
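// A minimal sketch of the x4 variants tested above, under the same
// assumptions: the first operand widens to a four-vector tuple while the
// indexed operand stays a single vector, and the 8-bit forms still accept
// lane immediates 0..15.
void example_mlal_lane_x4(uint32_t slice_base, svint8x4_t zn,
                          svint8_t zm) __arm_streaming __arm_shared_za
{
  svmla_lane_za32_s8_vg4x4(slice_base, zn, zm, 15); // highest valid 8-bit lane
}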
// MLSL
// CHECK-LABEL: @test_smlsl_lane_x4_s8(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[ADD:%.*]] = add i32 [[SLICE_BASE:%.*]], 4
// CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv64i8(<vscale x 64 x i8> [[ZN:%.*]], i64 0)
// CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv64i8(<vscale x 64 x i8> [[ZN]], i64 16)
// CHECK-NEXT: [[TMP2:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv64i8(<vscale x 64 x i8> [[ZN]], i64 32)
// CHECK-NEXT: [[TMP3:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv64i8(<vscale x 64 x i8> [[ZN]], i64 48)
-// CHECK-NEXT: tail call void @llvm.aarch64.sme.smls.za32.lane.vg4x4.nxv16i8(i32 [[ADD]], <vscale x 16 x i8> [[TMP0]], <vscale x 16 x i8> [[TMP1]], <vscale x 16 x i8> [[TMP2]], <vscale x 16 x i8> [[TMP3]], <vscale x 16 x i8> [[ZM:%.*]], i32 15)
+// CHECK-NEXT: tail call void @llvm.aarch64.sme.smls.za32.lane.vg4x4.nxv16i8(i32 [[SLICE_BASE:%.*]], <vscale x 16 x i8> [[TMP0]], <vscale x 16 x i8> [[TMP1]], <vscale x 16 x i8> [[TMP2]], <vscale x 16 x i8> [[TMP3]], <vscale x 16 x i8> [[ZM:%.*]], i32 15)
// CHECK-NEXT: ret void
//
// CPP-CHECK-LABEL: @_Z21test_smlsl_lane_x4_s8j10svint8x4_tu10__SVInt8_t(
// CPP-CHECK-NEXT: entry:
-// CPP-CHECK-NEXT: [[ADD:%.*]] = add i32 [[SLICE_BASE:%.*]], 4
// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv64i8(<vscale x 64 x i8> [[ZN:%.*]], i64 0)
// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv64i8(<vscale x 64 x i8> [[ZN]], i64 16)
// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv64i8(<vscale x 64 x i8> [[ZN]], i64 32)
// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv64i8(<vscale x 64 x i8> [[ZN]], i64 48)
-// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.smls.za32.lane.vg4x4.nxv16i8(i32 [[ADD]], <vscale x 16 x i8> [[TMP0]], <vscale x 16 x i8> [[TMP1]], <vscale x 16 x i8> [[TMP2]], <vscale x 16 x i8> [[TMP3]], <vscale x 16 x i8> [[ZM:%.*]], i32 15)
+// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.smls.za32.lane.vg4x4.nxv16i8(i32 [[SLICE_BASE:%.*]], <vscale x 16 x i8> [[TMP0]], <vscale x 16 x i8> [[TMP1]], <vscale x 16 x i8> [[TMP2]], <vscale x 16 x i8> [[TMP3]], <vscale x 16 x i8> [[ZM:%.*]], i32 15)
// CPP-CHECK-NEXT: ret void
//
void test_smlsl_lane_x4_s8(uint32_t slice_base, svint8x4_t zn, svint8_t zm) __arm_streaming __arm_shared_za
{
- SVE_ACLE_FUNC(svmls_lane_za32,_s8,_vg4x4,,)(slice_base + 4, zn, zm, 15);
+ SVE_ACLE_FUNC(svmls_lane_za32,_s8,_vg4x4,,)(slice_base, zn, zm, 15);
}
// CHECK-LABEL: @test_smlsl_lane_x4_s16(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[ADD:%.*]] = add i32 [[SLICE_BASE:%.*]], 4
// CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv32i16(<vscale x 32 x i16> [[ZN:%.*]], i64 0)
// CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv32i16(<vscale x 32 x i16> [[ZN]], i64 8)
// CHECK-NEXT: [[TMP2:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv32i16(<vscale x 32 x i16> [[ZN]], i64 16)
// CHECK-NEXT: [[TMP3:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv32i16(<vscale x 32 x i16> [[ZN]], i64 24)
-// CHECK-NEXT: tail call void @llvm.aarch64.sme.smls.za64.lane.vg4x4.nxv8i16(i32 [[ADD]], <vscale x 8 x i16> [[TMP0]], <vscale x 8 x i16> [[TMP1]], <vscale x 8 x i16> [[TMP2]], <vscale x 8 x i16> [[TMP3]], <vscale x 8 x i16> [[ZM:%.*]], i32 7)
+// CHECK-NEXT: tail call void @llvm.aarch64.sme.smls.za64.lane.vg4x4.nxv8i16(i32 [[SLICE_BASE:%.*]], <vscale x 8 x i16> [[TMP0]], <vscale x 8 x i16> [[TMP1]], <vscale x 8 x i16> [[TMP2]], <vscale x 8 x i16> [[TMP3]], <vscale x 8 x i16> [[ZM:%.*]], i32 7)
// CHECK-NEXT: ret void
//
// CPP-CHECK-LABEL: @_Z22test_smlsl_lane_x4_s16j11svint16x4_tu11__SVInt16_t(
// CPP-CHECK-NEXT: entry:
-// CPP-CHECK-NEXT: [[ADD:%.*]] = add i32 [[SLICE_BASE:%.*]], 4
// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv32i16(<vscale x 32 x i16> [[ZN:%.*]], i64 0)
// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv32i16(<vscale x 32 x i16> [[ZN]], i64 8)
// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv32i16(<vscale x 32 x i16> [[ZN]], i64 16)
// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv32i16(<vscale x 32 x i16> [[ZN]], i64 24)
-// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.smls.za64.lane.vg4x4.nxv8i16(i32 [[ADD]], <vscale x 8 x i16> [[TMP0]], <vscale x 8 x i16> [[TMP1]], <vscale x 8 x i16> [[TMP2]], <vscale x 8 x i16> [[TMP3]], <vscale x 8 x i16> [[ZM:%.*]], i32 7)
+// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.smls.za64.lane.vg4x4.nxv8i16(i32 [[SLICE_BASE:%.*]], <vscale x 8 x i16> [[TMP0]], <vscale x 8 x i16> [[TMP1]], <vscale x 8 x i16> [[TMP2]], <vscale x 8 x i16> [[TMP3]], <vscale x 8 x i16> [[ZM:%.*]], i32 7)
// CPP-CHECK-NEXT: ret void
//
void test_smlsl_lane_x4_s16(uint32_t slice_base, svint16x4_t zn, svint16_t zm) __arm_streaming __arm_shared_za
{
- SVE_ACLE_FUNC(svmls_lane_za64,_s16,_vg4x4,,)(slice_base + 4, zn, zm, 7);
+ SVE_ACLE_FUNC(svmls_lane_za64,_s16,_vg4x4,,)(slice_base, zn, zm, 7);
}
// CHECK-LABEL: @test_smlsl_lane_x4_u8(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[ADD:%.*]] = add i32 [[SLICE_BASE:%.*]], 4
// CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv64i8(<vscale x 64 x i8> [[ZN:%.*]], i64 0)
// CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv64i8(<vscale x 64 x i8> [[ZN]], i64 16)
// CHECK-NEXT: [[TMP2:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv64i8(<vscale x 64 x i8> [[ZN]], i64 32)
// CHECK-NEXT: [[TMP3:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv64i8(<vscale x 64 x i8> [[ZN]], i64 48)
-// CHECK-NEXT: tail call void @llvm.aarch64.sme.umls.za32.lane.vg4x4.nxv16i8(i32 [[ADD]], <vscale x 16 x i8> [[TMP0]], <vscale x 16 x i8> [[TMP1]], <vscale x 16 x i8> [[TMP2]], <vscale x 16 x i8> [[TMP3]], <vscale x 16 x i8> [[ZM:%.*]], i32 15)
+// CHECK-NEXT: tail call void @llvm.aarch64.sme.umls.za32.lane.vg4x4.nxv16i8(i32 [[SLICE_BASE:%.*]], <vscale x 16 x i8> [[TMP0]], <vscale x 16 x i8> [[TMP1]], <vscale x 16 x i8> [[TMP2]], <vscale x 16 x i8> [[TMP3]], <vscale x 16 x i8> [[ZM:%.*]], i32 15)
// CHECK-NEXT: ret void
//
// CPP-CHECK-LABEL: @_Z21test_smlsl_lane_x4_u8j11svuint8x4_tu11__SVUint8_t(
// CPP-CHECK-NEXT: entry:
-// CPP-CHECK-NEXT: [[ADD:%.*]] = add i32 [[SLICE_BASE:%.*]], 4
// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv64i8(<vscale x 64 x i8> [[ZN:%.*]], i64 0)
// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv64i8(<vscale x 64 x i8> [[ZN]], i64 16)
// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv64i8(<vscale x 64 x i8> [[ZN]], i64 32)
// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv64i8(<vscale x 64 x i8> [[ZN]], i64 48)
-// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.umls.za32.lane.vg4x4.nxv16i8(i32 [[ADD]], <vscale x 16 x i8> [[TMP0]], <vscale x 16 x i8> [[TMP1]], <vscale x 16 x i8> [[TMP2]], <vscale x 16 x i8> [[TMP3]], <vscale x 16 x i8> [[ZM:%.*]], i32 15)
+// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.umls.za32.lane.vg4x4.nxv16i8(i32 [[SLICE_BASE:%.*]], <vscale x 16 x i8> [[TMP0]], <vscale x 16 x i8> [[TMP1]], <vscale x 16 x i8> [[TMP2]], <vscale x 16 x i8> [[TMP3]], <vscale x 16 x i8> [[ZM:%.*]], i32 15)
// CPP-CHECK-NEXT: ret void
//
void test_smlsl_lane_x4_u8(uint32_t slice_base, svuint8x4_t zn, svuint8_t zm) __arm_streaming __arm_shared_za
{
- SVE_ACLE_FUNC(svmls_lane_za32,_u8,_vg4x4,,)(slice_base + 4, zn, zm, 15);
+ SVE_ACLE_FUNC(svmls_lane_za32,_u8,_vg4x4,,)(slice_base, zn, zm, 15);
}
// CHECK-LABEL: @test_smlsl_lane_x4_u16(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[ADD:%.*]] = add i32 [[SLICE_BASE:%.*]], 4
// CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv32i16(<vscale x 32 x i16> [[ZN:%.*]], i64 0)
// CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv32i16(<vscale x 32 x i16> [[ZN]], i64 8)
// CHECK-NEXT: [[TMP2:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv32i16(<vscale x 32 x i16> [[ZN]], i64 16)
// CHECK-NEXT: [[TMP3:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv32i16(<vscale x 32 x i16> [[ZN]], i64 24)
-// CHECK-NEXT: tail call void @llvm.aarch64.sme.umls.za64.lane.vg4x4.nxv8i16(i32 [[ADD]], <vscale x 8 x i16> [[TMP0]], <vscale x 8 x i16> [[TMP1]], <vscale x 8 x i16> [[TMP2]], <vscale x 8 x i16> [[TMP3]], <vscale x 8 x i16> [[ZM:%.*]], i32 7)
+// CHECK-NEXT: tail call void @llvm.aarch64.sme.umls.za64.lane.vg4x4.nxv8i16(i32 [[SLICE_BASE:%.*]], <vscale x 8 x i16> [[TMP0]], <vscale x 8 x i16> [[TMP1]], <vscale x 8 x i16> [[TMP2]], <vscale x 8 x i16> [[TMP3]], <vscale x 8 x i16> [[ZM:%.*]], i32 7)
// CHECK-NEXT: ret void
//
// CPP-CHECK-LABEL: @_Z22test_smlsl_lane_x4_u16j12svuint16x4_tu12__SVUint16_t(
// CPP-CHECK-NEXT: entry:
-// CPP-CHECK-NEXT: [[ADD:%.*]] = add i32 [[SLICE_BASE:%.*]], 4
// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv32i16(<vscale x 32 x i16> [[ZN:%.*]], i64 0)
// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv32i16(<vscale x 32 x i16> [[ZN]], i64 8)
// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv32i16(<vscale x 32 x i16> [[ZN]], i64 16)
// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv32i16(<vscale x 32 x i16> [[ZN]], i64 24)
-// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.umls.za64.lane.vg4x4.nxv8i16(i32 [[ADD]], <vscale x 8 x i16> [[TMP0]], <vscale x 8 x i16> [[TMP1]], <vscale x 8 x i16> [[TMP2]], <vscale x 8 x i16> [[TMP3]], <vscale x 8 x i16> [[ZM:%.*]], i32 7)
+// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.umls.za64.lane.vg4x4.nxv8i16(i32 [[SLICE_BASE:%.*]], <vscale x 8 x i16> [[TMP0]], <vscale x 8 x i16> [[TMP1]], <vscale x 8 x i16> [[TMP2]], <vscale x 8 x i16> [[TMP3]], <vscale x 8 x i16> [[ZM:%.*]], i32 7)
// CPP-CHECK-NEXT: ret void
//
void test_smlsl_lane_x4_u16(uint32_t slice_base, svuint16x4_t zn, svuint16_t zm) __arm_streaming __arm_shared_za
{
- SVE_ACLE_FUNC(svmls_lane_za64,_u16,_vg4x4,,)(slice_base + 4, zn, zm, 7);
+ SVE_ACLE_FUNC(svmls_lane_za64,_u16,_vg4x4,,)(slice_base, zn, zm, 7);
}
// SUMLALL
// CHECK-LABEL: @test_sumlall_lane_x4_s8(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[ADD:%.*]] = add i32 [[SLICE_BASE:%.*]], 4
// CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv64i8(<vscale x 64 x i8> [[ZN:%.*]], i64 0)
// CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv64i8(<vscale x 64 x i8> [[ZN]], i64 16)
// CHECK-NEXT: [[TMP2:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv64i8(<vscale x 64 x i8> [[ZN]], i64 32)
// CHECK-NEXT: [[TMP3:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv64i8(<vscale x 64 x i8> [[ZN]], i64 48)
-// CHECK-NEXT: tail call void @llvm.aarch64.sme.sumla.za32.lane.vg4x4.nxv16i8(i32 [[ADD]], <vscale x 16 x i8> [[TMP0]], <vscale x 16 x i8> [[TMP1]], <vscale x 16 x i8> [[TMP2]], <vscale x 16 x i8> [[TMP3]], <vscale x 16 x i8> [[ZM:%.*]], i32 15)
+// CHECK-NEXT: tail call void @llvm.aarch64.sme.sumla.za32.lane.vg4x4.nxv16i8(i32 [[SLICE_BASE:%.*]], <vscale x 16 x i8> [[TMP0]], <vscale x 16 x i8> [[TMP1]], <vscale x 16 x i8> [[TMP2]], <vscale x 16 x i8> [[TMP3]], <vscale x 16 x i8> [[ZM:%.*]], i32 15)
// CHECK-NEXT: ret void
//
// CPP-CHECK-LABEL: @_Z23test_sumlall_lane_x4_s8j10svint8x4_tu11__SVUint8_t(
// CPP-CHECK-NEXT: entry:
-// CPP-CHECK-NEXT: [[ADD:%.*]] = add i32 [[SLICE_BASE:%.*]], 4
// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv64i8(<vscale x 64 x i8> [[ZN:%.*]], i64 0)
// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv64i8(<vscale x 64 x i8> [[ZN]], i64 16)
// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv64i8(<vscale x 64 x i8> [[ZN]], i64 32)
// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv64i8(<vscale x 64 x i8> [[ZN]], i64 48)
-// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.sumla.za32.lane.vg4x4.nxv16i8(i32 [[ADD]], <vscale x 16 x i8> [[TMP0]], <vscale x 16 x i8> [[TMP1]], <vscale x 16 x i8> [[TMP2]], <vscale x 16 x i8> [[TMP3]], <vscale x 16 x i8> [[ZM:%.*]], i32 15)
+// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.sumla.za32.lane.vg4x4.nxv16i8(i32 [[SLICE_BASE:%.*]], <vscale x 16 x i8> [[TMP0]], <vscale x 16 x i8> [[TMP1]], <vscale x 16 x i8> [[TMP2]], <vscale x 16 x i8> [[TMP3]], <vscale x 16 x i8> [[ZM:%.*]], i32 15)
// CPP-CHECK-NEXT: ret void
//
void test_sumlall_lane_x4_s8(uint32_t slice_base, svint8x4_t zn, svuint8_t zm) __arm_streaming __arm_shared_za
{
- SVE_ACLE_FUNC(svsumla_lane_za32,_s8,_vg4x4,,)(slice_base + 4, zn, zm, 15);
+ SVE_ACLE_FUNC(svsumla_lane_za32,_s8,_vg4x4,,)(slice_base, zn, zm, 15);
}
// USMLALL
// CHECK-LABEL: @test_usmlall_lane_x4_u8(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[ADD:%.*]] = add i32 [[SLICE_BASE:%.*]], 4
// CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv64i8(<vscale x 64 x i8> [[ZN:%.*]], i64 0)
// CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv64i8(<vscale x 64 x i8> [[ZN]], i64 16)
// CHECK-NEXT: [[TMP2:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv64i8(<vscale x 64 x i8> [[ZN]], i64 32)
// CHECK-NEXT: [[TMP3:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv64i8(<vscale x 64 x i8> [[ZN]], i64 48)
-// CHECK-NEXT: tail call void @llvm.aarch64.sme.usmla.za32.lane.vg4x4.nxv16i8(i32 [[ADD]], <vscale x 16 x i8> [[TMP0]], <vscale x 16 x i8> [[TMP1]], <vscale x 16 x i8> [[TMP2]], <vscale x 16 x i8> [[TMP3]], <vscale x 16 x i8> [[ZM:%.*]], i32 15)
+// CHECK-NEXT: tail call void @llvm.aarch64.sme.usmla.za32.lane.vg4x4.nxv16i8(i32 [[SLICE_BASE:%.*]], <vscale x 16 x i8> [[TMP0]], <vscale x 16 x i8> [[TMP1]], <vscale x 16 x i8> [[TMP2]], <vscale x 16 x i8> [[TMP3]], <vscale x 16 x i8> [[ZM:%.*]], i32 15)
// CHECK-NEXT: ret void
//
// CPP-CHECK-LABEL: @_Z23test_usmlall_lane_x4_u8j11svuint8x4_tu10__SVInt8_t(
// CPP-CHECK-NEXT: entry:
-// CPP-CHECK-NEXT: [[ADD:%.*]] = add i32 [[SLICE_BASE:%.*]], 4
// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv64i8(<vscale x 64 x i8> [[ZN:%.*]], i64 0)
// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv64i8(<vscale x 64 x i8> [[ZN]], i64 16)
// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv64i8(<vscale x 64 x i8> [[ZN]], i64 32)
// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv64i8(<vscale x 64 x i8> [[ZN]], i64 48)
-// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.usmla.za32.lane.vg4x4.nxv16i8(i32 [[ADD]], <vscale x 16 x i8> [[TMP0]], <vscale x 16 x i8> [[TMP1]], <vscale x 16 x i8> [[TMP2]], <vscale x 16 x i8> [[TMP3]], <vscale x 16 x i8> [[ZM:%.*]], i32 15)
+// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.usmla.za32.lane.vg4x4.nxv16i8(i32 [[SLICE_BASE:%.*]], <vscale x 16 x i8> [[TMP0]], <vscale x 16 x i8> [[TMP1]], <vscale x 16 x i8> [[TMP2]], <vscale x 16 x i8> [[TMP3]], <vscale x 16 x i8> [[ZM:%.*]], i32 15)
// CPP-CHECK-NEXT: ret void
//
void test_usmlall_lane_x4_u8(uint32_t slice_base, svuint8x4_t zn, svint8_t zm) __arm_streaming __arm_shared_za
{
- SVE_ACLE_FUNC(svusmla_lane_za32,_u8,_vg4x4,,)(slice_base + 4, zn, zm, 15);
+ SVE_ACLE_FUNC(svusmla_lane_za32,_u8,_vg4x4,,)(slice_base, zn, zm, 15);
}
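// These tests are typically compiled twice, once with the fully-suffixed
// names written out in the SVE_ACLE_FUNC invocations and once with the
// overloaded short forms; the overloaded spelling in the comment below is
// inferred from the usual macro layout and is an assumption here.
void example_usmla_naming(uint32_t slice_base, svuint8x4_t zn,
                          svint8_t zm) __arm_streaming __arm_shared_za
{
  svusmla_lane_za32_u8_vg4x4(slice_base, zn, zm, 15); // explicit form
  // svusmla_lane_za32_vg4x4(slice_base, zn, zm, 15); // overloaded form (assumed)
}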
diff --git a/clang/test/CodeGen/aarch64-sme2-intrinsics/acle_sme2_mlsl.c b/clang/test/CodeGen/aarch64-sme2-intrinsics/acle_sme2_mlsl.c
index e8528271c81d37..1368ffe52c1ea5 100644
--- a/clang/test/CodeGen/aarch64-sme2-intrinsics/acle_sme2_mlsl.c
+++ b/clang/test/CodeGen/aarch64-sme2-intrinsics/acle_sme2_mlsl.c
@@ -19,107 +19,98 @@
// Multi, multi
// CHECK-LABEL: @test_svmls2_f16(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[ADD:%.*]] = add i32 [[SLICE_BASE:%.*]], 6
// CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 8 x half> @llvm.vector.extract.nxv8f16.nxv16f16(<vscale x 16 x half> [[ZN:%.*]], i64 0)
// CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 8 x half> @llvm.vector.extract.nxv8f16.nxv16f16(<vscale x 16 x half> [[ZN]], i64 8)
// CHECK-NEXT: [[TMP2:%.*]] = tail call <vscale x 8 x half> @llvm.vector.extract.nxv8f16.nxv16f16(<vscale x 16 x half> [[ZM:%.*]], i64 0)
// CHECK-NEXT: [[TMP3:%.*]] = tail call <vscale x 8 x half> @llvm.vector.extract.nxv8f16.nxv16f16(<vscale x 16 x half> [[ZM]], i64 8)
-// CHECK-NEXT: tail call void @llvm.aarch64.sme.fmlsl.vg2x2.nxv8f16(i32 [[ADD]], <vscale x 8 x half> [[TMP0]], <vscale x 8 x half> [[TMP1]], <vscale x 8 x half> [[TMP2]], <vscale x 8 x half> [[TMP3]])
+// CHECK-NEXT: tail call void @llvm.aarch64.sme.fmlsl.vg2x2.nxv8f16(i32 [[SLICE_BASE:%.*]], <vscale x 8 x half> [[TMP0]], <vscale x 8 x half> [[TMP1]], <vscale x 8 x half> [[TMP2]], <vscale x 8 x half> [[TMP3]])
// CHECK-NEXT: ret void
//
// CPP-CHECK-LABEL: @_Z15test_svmls2_f16j13svfloat16x2_tS_(
// CPP-CHECK-NEXT: entry:
-// CPP-CHECK-NEXT: [[ADD:%.*]] = add i32 [[SLICE_BASE:%.*]], 6
// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 8 x half> @llvm.vector.extract.nxv8f16.nxv16f16(<vscale x 16 x half> [[ZN:%.*]], i64 0)
// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 8 x half> @llvm.vector.extract.nxv8f16.nxv16f16(<vscale x 16 x half> [[ZN]], i64 8)
// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call <vscale x 8 x half> @llvm.vector.extract.nxv8f16.nxv16f16(<vscale x 16 x half> [[ZM:%.*]], i64 0)
// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call <vscale x 8 x half> @llvm.vector.extract.nxv8f16.nxv16f16(<vscale x 16 x half> [[ZM]], i64 8)
-// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.fmlsl.vg2x2.nxv8f16(i32 [[ADD]], <vscale x 8 x half> [[TMP0]], <vscale x 8 x half> [[TMP1]], <vscale x 8 x half> [[TMP2]], <vscale x 8 x half> [[TMP3]])
+// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.fmlsl.vg2x2.nxv8f16(i32 [[SLICE_BASE:%.*]], <vscale x 8 x half> [[TMP0]], <vscale x 8 x half> [[TMP1]], <vscale x 8 x half> [[TMP2]], <vscale x 8 x half> [[TMP3]])
// CPP-CHECK-NEXT: ret void
//
void test_svmls2_f16(uint32_t slice_base, svfloat16x2_t zn, svfloat16x2_t zm) __arm_streaming __arm_shared_za
{
- SVE_ACLE_FUNC(svmls_za32,_f16,_vg2x2,,)(slice_base + 6, zn, zm);
+ SVE_ACLE_FUNC(svmls_za32,_f16,_vg2x2,,)(slice_base, zn, zm);
}
// CHECK-LABEL: @test_svmls2_bf16(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[ADD:%.*]] = add i32 [[SLICE_BASE:%.*]], 6
// CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 8 x bfloat> @llvm.vector.extract.nxv8bf16.nxv16bf16(<vscale x 16 x bfloat> [[ZN:%.*]], i64 0)
// CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 8 x bfloat> @llvm.vector.extract.nxv8bf16.nxv16bf16(<vscale x 16 x bfloat> [[ZN]], i64 8)
// CHECK-NEXT: [[TMP2:%.*]] = tail call <vscale x 8 x bfloat> @llvm.vector.extract.nxv8bf16.nxv16bf16(<vscale x 16 x bfloat> [[ZM:%.*]], i64 0)
// CHECK-NEXT: [[TMP3:%.*]] = tail call <vscale x 8 x bfloat> @llvm.vector.extract.nxv8bf16.nxv16bf16(<vscale x 16 x bfloat> [[ZM]], i64 8)
-// CHECK-NEXT: tail call void @llvm.aarch64.sme.fmlsl.vg2x2.nxv8bf16(i32 [[ADD]], <vscale x 8 x bfloat> [[TMP0]], <vscale x 8 x bfloat> [[TMP1]], <vscale x 8 x bfloat> [[TMP2]], <vscale x 8 x bfloat> [[TMP3]])
+// CHECK-NEXT: tail call void @llvm.aarch64.sme.fmlsl.vg2x2.nxv8bf16(i32 [[SLICE_BASE:%.*]], <vscale x 8 x bfloat> [[TMP0]], <vscale x 8 x bfloat> [[TMP1]], <vscale x 8 x bfloat> [[TMP2]], <vscale x 8 x bfloat> [[TMP3]])
// CHECK-NEXT: ret void
//
// CPP-CHECK-LABEL: @_Z16test_svmls2_bf16j14svbfloat16x2_tS_(
// CPP-CHECK-NEXT: entry:
-// CPP-CHECK-NEXT: [[ADD:%.*]] = add i32 [[SLICE_BASE:%.*]], 6
// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 8 x bfloat> @llvm.vector.extract.nxv8bf16.nxv16bf16(<vscale x 16 x bfloat> [[ZN:%.*]], i64 0)
// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 8 x bfloat> @llvm.vector.extract.nxv8bf16.nxv16bf16(<vscale x 16 x bfloat> [[ZN]], i64 8)
// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call <vscale x 8 x bfloat> @llvm.vector.extract.nxv8bf16.nxv16bf16(<vscale x 16 x bfloat> [[ZM:%.*]], i64 0)
// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call <vscale x 8 x bfloat> @llvm.vector.extract.nxv8bf16.nxv16bf16(<vscale x 16 x bfloat> [[ZM]], i64 8)
-// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.fmlsl.vg2x2.nxv8bf16(i32 [[ADD]], <vscale x 8 x bfloat> [[TMP0]], <vscale x 8 x bfloat> [[TMP1]], <vscale x 8 x bfloat> [[TMP2]], <vscale x 8 x bfloat> [[TMP3]])
+// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.fmlsl.vg2x2.nxv8bf16(i32 [[SLICE_BASE:%.*]], <vscale x 8 x bfloat> [[TMP0]], <vscale x 8 x bfloat> [[TMP1]], <vscale x 8 x bfloat> [[TMP2]], <vscale x 8 x bfloat> [[TMP3]])
// CPP-CHECK-NEXT: ret void
//
void test_svmls2_bf16(uint32_t slice_base, svbfloat16x2_t zn, svbfloat16x2_t zm) __arm_streaming __arm_shared_za
{
- SVE_ACLE_FUNC(svmls_za32,_bf16,_vg2x2,,)(slice_base + 6, zn, zm);
+ SVE_ACLE_FUNC(svmls_za32,_bf16,_vg2x2,,)(slice_base, zn, zm);
}
// CHECK-LABEL: @test_svmls2_u16(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[ADD:%.*]] = add i32 [[SLICE_BASE:%.*]], 6
// CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv16i16(<vscale x 16 x i16> [[ZN:%.*]], i64 0)
// CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv16i16(<vscale x 16 x i16> [[ZN]], i64 8)
// CHECK-NEXT: [[TMP2:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv16i16(<vscale x 16 x i16> [[ZM:%.*]], i64 0)
// CHECK-NEXT: [[TMP3:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv16i16(<vscale x 16 x i16> [[ZM]], i64 8)
-// CHECK-NEXT: tail call void @llvm.aarch64.sme.umlsl.vg2x2.nxv8i16(i32 [[ADD]], <vscale x 8 x i16> [[TMP0]], <vscale x 8 x i16> [[TMP1]], <vscale x 8 x i16> [[TMP2]], <vscale x 8 x i16> [[TMP3]])
+// CHECK-NEXT: tail call void @llvm.aarch64.sme.umlsl.vg2x2.nxv8i16(i32 [[SLICE_BASE:%.*]], <vscale x 8 x i16> [[TMP0]], <vscale x 8 x i16> [[TMP1]], <vscale x 8 x i16> [[TMP2]], <vscale x 8 x i16> [[TMP3]])
// CHECK-NEXT: ret void
//
// CPP-CHECK-LABEL: @_Z15test_svmls2_u16j12svuint16x2_tS_(
// CPP-CHECK-NEXT: entry:
-// CPP-CHECK-NEXT: [[ADD:%.*]] = add i32 [[SLICE_BASE:%.*]], 6
// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv16i16(<vscale x 16 x i16> [[ZN:%.*]], i64 0)
// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv16i16(<vscale x 16 x i16> [[ZN]], i64 8)
// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv16i16(<vscale x 16 x i16> [[ZM:%.*]], i64 0)
// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv16i16(<vscale x 16 x i16> [[ZM]], i64 8)
-// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.umlsl.vg2x2.nxv8i16(i32 [[ADD]], <vscale x 8 x i16> [[TMP0]], <vscale x 8 x i16> [[TMP1]], <vscale x 8 x i16> [[TMP2]], <vscale x 8 x i16> [[TMP3]])
+// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.umlsl.vg2x2.nxv8i16(i32 [[SLICE_BASE:%.*]], <vscale x 8 x i16> [[TMP0]], <vscale x 8 x i16> [[TMP1]], <vscale x 8 x i16> [[TMP2]], <vscale x 8 x i16> [[TMP3]])
// CPP-CHECK-NEXT: ret void
//
void test_svmls2_u16(uint32_t slice_base, svuint16x2_t zn, svuint16x2_t zm) __arm_streaming __arm_shared_za
{
- SVE_ACLE_FUNC(svmls_za32,_u16,_vg2x2,,)(slice_base + 6, zn, zm);
+ SVE_ACLE_FUNC(svmls_za32,_u16,_vg2x2,,)(slice_base, zn, zm);
}
// CHECK-LABEL: @test_svmls2_s16(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[ADD:%.*]] = add i32 [[SLICE_BASE:%.*]], 6
// CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv16i16(<vscale x 16 x i16> [[ZN:%.*]], i64 0)
// CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv16i16(<vscale x 16 x i16> [[ZN]], i64 8)
// CHECK-NEXT: [[TMP2:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv16i16(<vscale x 16 x i16> [[ZM:%.*]], i64 0)
// CHECK-NEXT: [[TMP3:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv16i16(<vscale x 16 x i16> [[ZM]], i64 8)
-// CHECK-NEXT: tail call void @llvm.aarch64.sme.smlsl.vg2x2.nxv8i16(i32 [[ADD]], <vscale x 8 x i16> [[TMP0]], <vscale x 8 x i16> [[TMP1]], <vscale x 8 x i16> [[TMP2]], <vscale x 8 x i16> [[TMP3]])
+// CHECK-NEXT: tail call void @llvm.aarch64.sme.smlsl.vg2x2.nxv8i16(i32 [[SLICE_BASE:%.*]], <vscale x 8 x i16> [[TMP0]], <vscale x 8 x i16> [[TMP1]], <vscale x 8 x i16> [[TMP2]], <vscale x 8 x i16> [[TMP3]])
// CHECK-NEXT: ret void
//
// CPP-CHECK-LABEL: @_Z15test_svmls2_s16j11svint16x2_tS_(
// CPP-CHECK-NEXT: entry:
-// CPP-CHECK-NEXT: [[ADD:%.*]] = add i32 [[SLICE_BASE:%.*]], 6
// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv16i16(<vscale x 16 x i16> [[ZN:%.*]], i64 0)
// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv16i16(<vscale x 16 x i16> [[ZN]], i64 8)
// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv16i16(<vscale x 16 x i16> [[ZM:%.*]], i64 0)
// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv16i16(<vscale x 16 x i16> [[ZM]], i64 8)
-// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.smlsl.vg2x2.nxv8i16(i32 [[ADD]], <vscale x 8 x i16> [[TMP0]], <vscale x 8 x i16> [[TMP1]], <vscale x 8 x i16> [[TMP2]], <vscale x 8 x i16> [[TMP3]])
+// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.smlsl.vg2x2.nxv8i16(i32 [[SLICE_BASE:%.*]], <vscale x 8 x i16> [[TMP0]], <vscale x 8 x i16> [[TMP1]], <vscale x 8 x i16> [[TMP2]], <vscale x 8 x i16> [[TMP3]])
// CPP-CHECK-NEXT: ret void
//
void test_svmls2_s16(uint32_t slice_base, svint16x2_t zn, svint16x2_t zm) __arm_streaming __arm_shared_za
{
- SVE_ACLE_FUNC(svmls_za32,_s16,_vg2x2,,)(slice_base + 6, zn, zm);
+ SVE_ACLE_FUNC(svmls_za32,_s16,_vg2x2,,)(slice_base, zn, zm);
}
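// A minimal sketch of the multi-vector ("multi, multi") forms above, under
// the same test-file assumptions; the helper name is illustrative. Both
// operands are two-vector tuples and there is no lane immediate.
void example_mls2_f16(uint32_t slice_base, svfloat16x2_t zn,
                      svfloat16x2_t zm) __arm_streaming __arm_shared_za
{
  // Widening fp16 multiply-subtract into ZA32, starting at slice_base.
  svmls_za32_f16_vg2x2(slice_base, zn, zm);
}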
// CHECK-LABEL: @test_svmls4_f16(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[ADD:%.*]] = add i32 [[SLICE_BASE:%.*]], 6
// CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 8 x half> @llvm.vector.extract.nxv8f16.nxv32f16(<vscale x 32 x half> [[ZN:%.*]], i64 0)
// CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 8 x half> @llvm.vector.extract.nxv8f16.nxv32f16(<vscale x 32 x half> [[ZN]], i64 8)
// CHECK-NEXT: [[TMP2:%.*]] = tail call <vscale x 8 x half> @llvm.vector.extract.nxv8f16.nxv32f16(<vscale x 32 x half> [[ZN]], i64 16)
@@ -128,12 +119,11 @@ void test_svmls2_s16(uint32_t slice_base, svint16x2_t zn, svint16x2_t zm) __arm_
// CHECK-NEXT: [[TMP5:%.*]] = tail call <vscale x 8 x half> @llvm.vector.extract.nxv8f16.nxv32f16(<vscale x 32 x half> [[ZM]], i64 8)
// CHECK-NEXT: [[TMP6:%.*]] = tail call <vscale x 8 x half> @llvm.vector.extract.nxv8f16.nxv32f16(<vscale x 32 x half> [[ZM]], i64 16)
// CHECK-NEXT: [[TMP7:%.*]] = tail call <vscale x 8 x half> @llvm.vector.extract.nxv8f16.nxv32f16(<vscale x 32 x half> [[ZM]], i64 24)
-// CHECK-NEXT: tail call void @llvm.aarch64.sme.fmlsl.vg2x4.nxv8f16(i32 [[ADD]], <vscale x 8 x half> [[TMP0]], <vscale x 8 x half> [[TMP1]], <vscale x 8 x half> [[TMP2]], <vscale x 8 x half> [[TMP3]], <vscale x 8 x half> [[TMP4]], <vscale x 8 x half> [[TMP5]], <vscale x 8 x half> [[TMP6]], <vscale x 8 x half> [[TMP7]])
+// CHECK-NEXT: tail call void @llvm.aarch64.sme.fmlsl.vg2x4.nxv8f16(i32 [[SLICE_BASE:%.*]], <vscale x 8 x half> [[TMP0]], <vscale x 8 x half> [[TMP1]], <vscale x 8 x half> [[TMP2]], <vscale x 8 x half> [[TMP3]], <vscale x 8 x half> [[TMP4]], <vscale x 8 x half> [[TMP5]], <vscale x 8 x half> [[TMP6]], <vscale x 8 x half> [[TMP7]])
// CHECK-NEXT: ret void
//
// CPP-CHECK-LABEL: @_Z15test_svmls4_f16j13svfloat16x4_tS_(
// CPP-CHECK-NEXT: entry:
-// CPP-CHECK-NEXT: [[ADD:%.*]] = add i32 [[SLICE_BASE:%.*]], 6
// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 8 x half> @llvm.vector.extract.nxv8f16.nxv32f16(<vscale x 32 x half> [[ZN:%.*]], i64 0)
// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 8 x half> @llvm.vector.extract.nxv8f16.nxv32f16(<vscale x 32 x half> [[ZN]], i64 8)
// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call <vscale x 8 x half> @llvm.vector.extract.nxv8f16.nxv32f16(<vscale x 32 x half> [[ZN]], i64 16)
@@ -142,17 +132,16 @@ void test_svmls2_s16(uint32_t slice_base, svint16x2_t zn, svint16x2_t zm) __arm_
// CPP-CHECK-NEXT: [[TMP5:%.*]] = tail call <vscale x 8 x half> @llvm.vector.extract.nxv8f16.nxv32f16(<vscale x 32 x half> [[ZM]], i64 8)
// CPP-CHECK-NEXT: [[TMP6:%.*]] = tail call <vscale x 8 x half> @llvm.vector.extract.nxv8f16.nxv32f16(<vscale x 32 x half> [[ZM]], i64 16)
// CPP-CHECK-NEXT: [[TMP7:%.*]] = tail call <vscale x 8 x half> @llvm.vector.extract.nxv8f16.nxv32f16(<vscale x 32 x half> [[ZM]], i64 24)
-// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.fmlsl.vg2x4.nxv8f16(i32 [[ADD]], <vscale x 8 x half> [[TMP0]], <vscale x 8 x half> [[TMP1]], <vscale x 8 x half> [[TMP2]], <vscale x 8 x half> [[TMP3]], <vscale x 8 x half> [[TMP4]], <vscale x 8 x half> [[TMP5]], <vscale x 8 x half> [[TMP6]], <vscale x 8 x half> [[TMP7]])
+// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.fmlsl.vg2x4.nxv8f16(i32 [[SLICE_BASE:%.*]], <vscale x 8 x half> [[TMP0]], <vscale x 8 x half> [[TMP1]], <vscale x 8 x half> [[TMP2]], <vscale x 8 x half> [[TMP3]], <vscale x 8 x half> [[TMP4]], <vscale x 8 x half> [[TMP5]], <vscale x 8 x half> [[TMP6]], <vscale x 8 x half> [[TMP7]])
// CPP-CHECK-NEXT: ret void
//
void test_svmls4_f16(uint32_t slice_base, svfloat16x4_t zn, svfloat16x4_t zm) __arm_streaming __arm_shared_za
{
- SVE_ACLE_FUNC(svmls_za32,_f16,_vg2x4,,)(slice_base + 6, zn, zm);
+ SVE_ACLE_FUNC(svmls_za32,_f16,_vg2x4,,)(slice_base, zn, zm);
}
// CHECK-LABEL: @test_svmls4_bf16(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[ADD:%.*]] = add i32 [[SLICE_BASE:%.*]], 6
// CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 8 x bfloat> @llvm.vector.extract.nxv8bf16.nxv32bf16(<vscale x 32 x bfloat> [[ZN:%.*]], i64 0)
// CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 8 x bfloat> @llvm.vector.extract.nxv8bf16.nxv32bf16(<vscale x 32 x bfloat> [[ZN]], i64 8)
// CHECK-NEXT: [[TMP2:%.*]] = tail call <vscale x 8 x bfloat> @llvm.vector.extract.nxv8bf16.nxv32bf16(<vscale x 32 x bfloat> [[ZN]], i64 16)
@@ -161,12 +150,11 @@ void test_svmls4_f16(uint32_t slice_base, svfloat16x4_t zn, svfloat16x4_t zm) __
// CHECK-NEXT: [[TMP5:%.*]] = tail call <vscale x 8 x bfloat> @llvm.vector.extract.nxv8bf16.nxv32bf16(<vscale x 32 x bfloat> [[ZM]], i64 8)
// CHECK-NEXT: [[TMP6:%.*]] = tail call <vscale x 8 x bfloat> @llvm.vector.extract.nxv8bf16.nxv32bf16(<vscale x 32 x bfloat> [[ZM]], i64 16)
// CHECK-NEXT: [[TMP7:%.*]] = tail call <vscale x 8 x bfloat> @llvm.vector.extract.nxv8bf16.nxv32bf16(<vscale x 32 x bfloat> [[ZM]], i64 24)
-// CHECK-NEXT: tail call void @llvm.aarch64.sme.fmlsl.vg2x4.nxv8bf16(i32 [[ADD]], <vscale x 8 x bfloat> [[TMP0]], <vscale x 8 x bfloat> [[TMP1]], <vscale x 8 x bfloat> [[TMP2]], <vscale x 8 x bfloat> [[TMP3]], <vscale x 8 x bfloat> [[TMP4]], <vscale x 8 x bfloat> [[TMP5]], <vscale x 8 x bfloat> [[TMP6]], <vscale x 8 x bfloat> [[TMP7]])
+// CHECK-NEXT: tail call void @llvm.aarch64.sme.fmlsl.vg2x4.nxv8bf16(i32 [[SLICE_BASE:%.*]], <vscale x 8 x bfloat> [[TMP0]], <vscale x 8 x bfloat> [[TMP1]], <vscale x 8 x bfloat> [[TMP2]], <vscale x 8 x bfloat> [[TMP3]], <vscale x 8 x bfloat> [[TMP4]], <vscale x 8 x bfloat> [[TMP5]], <vscale x 8 x bfloat> [[TMP6]], <vscale x 8 x bfloat> [[TMP7]])
// CHECK-NEXT: ret void
//
// CPP-CHECK-LABEL: @_Z16test_svmls4_bf16j14svbfloat16x4_tS_(
// CPP-CHECK-NEXT: entry:
-// CPP-CHECK-NEXT: [[ADD:%.*]] = add i32 [[SLICE_BASE:%.*]], 6
// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 8 x bfloat> @llvm.vector.extract.nxv8bf16.nxv32bf16(<vscale x 32 x bfloat> [[ZN:%.*]], i64 0)
// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 8 x bfloat> @llvm.vector.extract.nxv8bf16.nxv32bf16(<vscale x 32 x bfloat> [[ZN]], i64 8)
// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call <vscale x 8 x bfloat> @llvm.vector.extract.nxv8bf16.nxv32bf16(<vscale x 32 x bfloat> [[ZN]], i64 16)
@@ -175,17 +163,16 @@ void test_svmls4_f16(uint32_t slice_base, svfloat16x4_t zn, svfloat16x4_t zm) __
// CPP-CHECK-NEXT: [[TMP5:%.*]] = tail call <vscale x 8 x bfloat> @llvm.vector.extract.nxv8bf16.nxv32bf16(<vscale x 32 x bfloat> [[ZM]], i64 8)
// CPP-CHECK-NEXT: [[TMP6:%.*]] = tail call <vscale x 8 x bfloat> @llvm.vector.extract.nxv8bf16.nxv32bf16(<vscale x 32 x bfloat> [[ZM]], i64 16)
// CPP-CHECK-NEXT: [[TMP7:%.*]] = tail call <vscale x 8 x bfloat> @llvm.vector.extract.nxv8bf16.nxv32bf16(<vscale x 32 x bfloat> [[ZM]], i64 24)
-// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.fmlsl.vg2x4.nxv8bf16(i32 [[ADD]], <vscale x 8 x bfloat> [[TMP0]], <vscale x 8 x bfloat> [[TMP1]], <vscale x 8 x bfloat> [[TMP2]], <vscale x 8 x bfloat> [[TMP3]], <vscale x 8 x bfloat> [[TMP4]], <vscale x 8 x bfloat> [[TMP5]], <vscale x 8 x bfloat> [[TMP6]], <vscale x 8 x bfloat> [[TMP7]])
+// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.fmlsl.vg2x4.nxv8bf16(i32 [[SLICE_BASE:%.*]], <vscale x 8 x bfloat> [[TMP0]], <vscale x 8 x bfloat> [[TMP1]], <vscale x 8 x bfloat> [[TMP2]], <vscale x 8 x bfloat> [[TMP3]], <vscale x 8 x bfloat> [[TMP4]], <vscale x 8 x bfloat> [[TMP5]], <vscale x 8 x bfloat> [[TMP6]], <vscale x 8 x bfloat> [[TMP7]])
// CPP-CHECK-NEXT: ret void
//
void test_svmls4_bf16(uint32_t slice_base, svbfloat16x4_t zn, svbfloat16x4_t zm) __arm_streaming __arm_shared_za
{
- SVE_ACLE_FUNC(svmls_za32,_bf16,_vg2x4,,)(slice_base + 6, zn, zm);
+ SVE_ACLE_FUNC(svmls_za32,_bf16,_vg2x4,,)(slice_base, zn, zm);
}
// CHECK-LABEL: @test_svmls4_u16(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[ADD:%.*]] = add i32 [[SLICE_BASE:%.*]], 6
// CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv32i16(<vscale x 32 x i16> [[ZN:%.*]], i64 0)
// CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv32i16(<vscale x 32 x i16> [[ZN]], i64 8)
// CHECK-NEXT: [[TMP2:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv32i16(<vscale x 32 x i16> [[ZN]], i64 16)
@@ -194,12 +181,11 @@ void test_svmls4_bf16(uint32_t slice_base, svbfloat16x4_t zn, svbfloat16x4_t zm)
// CHECK-NEXT: [[TMP5:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv32i16(<vscale x 32 x i16> [[ZM]], i64 8)
// CHECK-NEXT: [[TMP6:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv32i16(<vscale x 32 x i16> [[ZM]], i64 16)
// CHECK-NEXT: [[TMP7:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv32i16(<vscale x 32 x i16> [[ZM]], i64 24)
-// CHECK-NEXT: tail call void @llvm.aarch64.sme.umlsl.vg2x4.nxv8i16(i32 [[ADD]], <vscale x 8 x i16> [[TMP0]], <vscale x 8 x i16> [[TMP1]], <vscale x 8 x i16> [[TMP2]], <vscale x 8 x i16> [[TMP3]], <vscale x 8 x i16> [[TMP4]], <vscale x 8 x i16> [[TMP5]], <vscale x 8 x i16> [[TMP6]], <vscale x 8 x i16> [[TMP7]])
+// CHECK-NEXT: tail call void @llvm.aarch64.sme.umlsl.vg2x4.nxv8i16(i32 [[SLICE_BASE:%.*]], <vscale x 8 x i16> [[TMP0]], <vscale x 8 x i16> [[TMP1]], <vscale x 8 x i16> [[TMP2]], <vscale x 8 x i16> [[TMP3]], <vscale x 8 x i16> [[TMP4]], <vscale x 8 x i16> [[TMP5]], <vscale x 8 x i16> [[TMP6]], <vscale x 8 x i16> [[TMP7]])
// CHECK-NEXT: ret void
//
// CPP-CHECK-LABEL: @_Z15test_svmls4_u16j12svuint16x4_tS_(
// CPP-CHECK-NEXT: entry:
-// CPP-CHECK-NEXT: [[ADD:%.*]] = add i32 [[SLICE_BASE:%.*]], 6
// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv32i16(<vscale x 32 x i16> [[ZN:%.*]], i64 0)
// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv32i16(<vscale x 32 x i16> [[ZN]], i64 8)
// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv32i16(<vscale x 32 x i16> [[ZN]], i64 16)
@@ -208,17 +194,16 @@ void test_svmls4_bf16(uint32_t slice_base, svbfloat16x4_t zn, svbfloat16x4_t zm)
// CPP-CHECK-NEXT: [[TMP5:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv32i16(<vscale x 32 x i16> [[ZM]], i64 8)
// CPP-CHECK-NEXT: [[TMP6:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv32i16(<vscale x 32 x i16> [[ZM]], i64 16)
// CPP-CHECK-NEXT: [[TMP7:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv32i16(<vscale x 32 x i16> [[ZM]], i64 24)
-// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.umlsl.vg2x4.nxv8i16(i32 [[ADD]], <vscale x 8 x i16> [[TMP0]], <vscale x 8 x i16> [[TMP1]], <vscale x 8 x i16> [[TMP2]], <vscale x 8 x i16> [[TMP3]], <vscale x 8 x i16> [[TMP4]], <vscale x 8 x i16> [[TMP5]], <vscale x 8 x i16> [[TMP6]], <vscale x 8 x i16> [[TMP7]])
+// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.umlsl.vg2x4.nxv8i16(i32 [[SLICE_BASE:%.*]], <vscale x 8 x i16> [[TMP0]], <vscale x 8 x i16> [[TMP1]], <vscale x 8 x i16> [[TMP2]], <vscale x 8 x i16> [[TMP3]], <vscale x 8 x i16> [[TMP4]], <vscale x 8 x i16> [[TMP5]], <vscale x 8 x i16> [[TMP6]], <vscale x 8 x i16> [[TMP7]])
// CPP-CHECK-NEXT: ret void
//
void test_svmls4_u16(uint32_t slice_base, svuint16x4_t zn, svuint16x4_t zm) __arm_streaming __arm_shared_za
{
- SVE_ACLE_FUNC(svmls_za32,_u16,_vg2x4,,)(slice_base + 6, zn, zm);
+ SVE_ACLE_FUNC(svmls_za32,_u16,_vg2x4,,)(slice_base, zn, zm);
}
// CHECK-LABEL: @test_svmls4_s16(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[ADD:%.*]] = add i32 [[SLICE_BASE:%.*]], 6
// CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv32i16(<vscale x 32 x i16> [[ZN:%.*]], i64 0)
// CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv32i16(<vscale x 32 x i16> [[ZN]], i64 8)
// CHECK-NEXT: [[TMP2:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv32i16(<vscale x 32 x i16> [[ZN]], i64 16)
@@ -227,12 +212,11 @@ void test_svmls4_u16(uint32_t slice_base, svuint16x4_t zn, svuint16x4_t zm) __ar
// CHECK-NEXT: [[TMP5:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv32i16(<vscale x 32 x i16> [[ZM]], i64 8)
// CHECK-NEXT: [[TMP6:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv32i16(<vscale x 32 x i16> [[ZM]], i64 16)
// CHECK-NEXT: [[TMP7:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv32i16(<vscale x 32 x i16> [[ZM]], i64 24)
-// CHECK-NEXT: tail call void @llvm.aarch64.sme.smlsl.vg2x4.nxv8i16(i32 [[ADD]], <vscale x 8 x i16> [[TMP0]], <vscale x 8 x i16> [[TMP1]], <vscale x 8 x i16> [[TMP2]], <vscale x 8 x i16> [[TMP3]], <vscale x 8 x i16> [[TMP4]], <vscale x 8 x i16> [[TMP5]], <vscale x 8 x i16> [[TMP6]], <vscale x 8 x i16> [[TMP7]])
+// CHECK-NEXT: tail call void @llvm.aarch64.sme.smlsl.vg2x4.nxv8i16(i32 [[SLICE_BASE:%.*]], <vscale x 8 x i16> [[TMP0]], <vscale x 8 x i16> [[TMP1]], <vscale x 8 x i16> [[TMP2]], <vscale x 8 x i16> [[TMP3]], <vscale x 8 x i16> [[TMP4]], <vscale x 8 x i16> [[TMP5]], <vscale x 8 x i16> [[TMP6]], <vscale x 8 x i16> [[TMP7]])
// CHECK-NEXT: ret void
//
// CPP-CHECK-LABEL: @_Z15test_svmls4_s16j11svint16x4_tS_(
// CPP-CHECK-NEXT: entry:
-// CPP-CHECK-NEXT: [[ADD:%.*]] = add i32 [[SLICE_BASE:%.*]], 6
// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv32i16(<vscale x 32 x i16> [[ZN:%.*]], i64 0)
// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv32i16(<vscale x 32 x i16> [[ZN]], i64 8)
// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv32i16(<vscale x 32 x i16> [[ZN]], i64 16)
@@ -241,266 +225,242 @@ void test_svmls4_u16(uint32_t slice_base, svuint16x4_t zn, svuint16x4_t zm) __ar
// CPP-CHECK-NEXT: [[TMP5:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv32i16(<vscale x 32 x i16> [[ZM]], i64 8)
// CPP-CHECK-NEXT: [[TMP6:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv32i16(<vscale x 32 x i16> [[ZM]], i64 16)
// CPP-CHECK-NEXT: [[TMP7:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv32i16(<vscale x 32 x i16> [[ZM]], i64 24)
-// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.smlsl.vg2x4.nxv8i16(i32 [[ADD]], <vscale x 8 x i16> [[TMP0]], <vscale x 8 x i16> [[TMP1]], <vscale x 8 x i16> [[TMP2]], <vscale x 8 x i16> [[TMP3]], <vscale x 8 x i16> [[TMP4]], <vscale x 8 x i16> [[TMP5]], <vscale x 8 x i16> [[TMP6]], <vscale x 8 x i16> [[TMP7]])
+// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.smlsl.vg2x4.nxv8i16(i32 [[SLICE_BASE:%.*]], <vscale x 8 x i16> [[TMP0]], <vscale x 8 x i16> [[TMP1]], <vscale x 8 x i16> [[TMP2]], <vscale x 8 x i16> [[TMP3]], <vscale x 8 x i16> [[TMP4]], <vscale x 8 x i16> [[TMP5]], <vscale x 8 x i16> [[TMP6]], <vscale x 8 x i16> [[TMP7]])
// CPP-CHECK-NEXT: ret void
//
void test_svmls4_s16(uint32_t slice_base, svint16x4_t zn, svint16x4_t zm) __arm_streaming __arm_shared_za
{
- SVE_ACLE_FUNC(svmls_za32,_s16,_vg2x4,,)(slice_base + 6, zn, zm);
+ SVE_ACLE_FUNC(svmls_za32,_s16,_vg2x4,,)(slice_base, zn, zm);
}
//
// Multi, single
// CHECK-LABEL: @test_svmls_single1_f16(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[ADD:%.*]] = add i32 [[SLICE_BASE:%.*]], 14
-// CHECK-NEXT: tail call void @llvm.aarch64.sme.fmlsl.single.vg2x1.nxv8f16(i32 [[ADD]], <vscale x 8 x half> [[ZN:%.*]], <vscale x 8 x half> [[ZM:%.*]])
+// CHECK-NEXT: tail call void @llvm.aarch64.sme.fmlsl.single.vg2x1.nxv8f16(i32 [[SLICE_BASE:%.*]], <vscale x 8 x half> [[ZN:%.*]], <vscale x 8 x half> [[ZM:%.*]])
// CHECK-NEXT: ret void
//
// CPP-CHECK-LABEL: @_Z22test_svmls_single1_f16ju13__SVFloat16_tS_(
// CPP-CHECK-NEXT: entry:
-// CPP-CHECK-NEXT: [[ADD:%.*]] = add i32 [[SLICE_BASE:%.*]], 14
-// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.fmlsl.single.vg2x1.nxv8f16(i32 [[ADD]], <vscale x 8 x half> [[ZN:%.*]], <vscale x 8 x half> [[ZM:%.*]])
+// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.fmlsl.single.vg2x1.nxv8f16(i32 [[SLICE_BASE:%.*]], <vscale x 8 x half> [[ZN:%.*]], <vscale x 8 x half> [[ZM:%.*]])
// CPP-CHECK-NEXT: ret void
//
void test_svmls_single1_f16(uint32_t slice_base, svfloat16_t zn, svfloat16_t zm) __arm_streaming __arm_shared_za
{
- SVE_ACLE_FUNC(svmls_za32,_f16,_vg2x1,,)(slice_base + 14, zn, zm);
+ SVE_ACLE_FUNC(svmls_za32,_f16,_vg2x1,,)(slice_base, zn, zm);
}
// CHECK-LABEL: @test_svmls_single1_bf16(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[ADD:%.*]] = add i32 [[SLICE_BASE:%.*]], 14
-// CHECK-NEXT: tail call void @llvm.aarch64.sme.fmlsl.single.vg2x1.nxv8bf16(i32 [[ADD]], <vscale x 8 x bfloat> [[ZN:%.*]], <vscale x 8 x bfloat> [[ZM:%.*]])
+// CHECK-NEXT: tail call void @llvm.aarch64.sme.fmlsl.single.vg2x1.nxv8bf16(i32 [[SLICE_BASE:%.*]], <vscale x 8 x bfloat> [[ZN:%.*]], <vscale x 8 x bfloat> [[ZM:%.*]])
// CHECK-NEXT: ret void
//
// CPP-CHECK-LABEL: @_Z23test_svmls_single1_bf16ju14__SVBfloat16_tS_(
// CPP-CHECK-NEXT: entry:
-// CPP-CHECK-NEXT: [[ADD:%.*]] = add i32 [[SLICE_BASE:%.*]], 14
-// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.fmlsl.single.vg2x1.nxv8bf16(i32 [[ADD]], <vscale x 8 x bfloat> [[ZN:%.*]], <vscale x 8 x bfloat> [[ZM:%.*]])
+// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.fmlsl.single.vg2x1.nxv8bf16(i32 [[SLICE_BASE:%.*]], <vscale x 8 x bfloat> [[ZN:%.*]], <vscale x 8 x bfloat> [[ZM:%.*]])
// CPP-CHECK-NEXT: ret void
//
void test_svmls_single1_bf16(uint32_t slice_base, svbfloat16_t zn, svbfloat16_t zm) __arm_streaming __arm_shared_za
{
- SVE_ACLE_FUNC(svmls_za32,_bf16,_vg2x1,,)(slice_base + 14, zn, zm);
+ SVE_ACLE_FUNC(svmls_za32,_bf16,_vg2x1,,)(slice_base, zn, zm);
}
// CHECK-LABEL: @test_svmls_single1_u16(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[ADD:%.*]] = add i32 [[SLICE_BASE:%.*]], 14
-// CHECK-NEXT: tail call void @llvm.aarch64.sme.umlsl.single.vg2x1.nxv8i16(i32 [[ADD]], <vscale x 8 x i16> [[ZN:%.*]], <vscale x 8 x i16> [[ZM:%.*]])
+// CHECK-NEXT: tail call void @llvm.aarch64.sme.umlsl.single.vg2x1.nxv8i16(i32 [[SLICE_BASE:%.*]], <vscale x 8 x i16> [[ZN:%.*]], <vscale x 8 x i16> [[ZM:%.*]])
// CHECK-NEXT: ret void
//
// CPP-CHECK-LABEL: @_Z22test_svmls_single1_u16ju12__SVUint16_tS_(
// CPP-CHECK-NEXT: entry:
-// CPP-CHECK-NEXT: [[ADD:%.*]] = add i32 [[SLICE_BASE:%.*]], 14
-// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.umlsl.single.vg2x1.nxv8i16(i32 [[ADD]], <vscale x 8 x i16> [[ZN:%.*]], <vscale x 8 x i16> [[ZM:%.*]])
+// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.umlsl.single.vg2x1.nxv8i16(i32 [[SLICE_BASE:%.*]], <vscale x 8 x i16> [[ZN:%.*]], <vscale x 8 x i16> [[ZM:%.*]])
// CPP-CHECK-NEXT: ret void
//
void test_svmls_single1_u16(uint32_t slice_base, svuint16_t zn, svuint16_t zm) __arm_streaming __arm_shared_za
{
- SVE_ACLE_FUNC(svmls_za32,_u16,_vg2x1,,)(slice_base + 14, zn, zm);
+ SVE_ACLE_FUNC(svmls_za32,_u16,_vg2x1,,)(slice_base, zn, zm);
}
// CHECK-LABEL: @test_svmls_single1_s16(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[ADD:%.*]] = add i32 [[SLICE_BASE:%.*]], 14
-// CHECK-NEXT: tail call void @llvm.aarch64.sme.smlsl.single.vg2x1.nxv8i16(i32 [[ADD]], <vscale x 8 x i16> [[ZN:%.*]], <vscale x 8 x i16> [[ZM:%.*]])
+// CHECK-NEXT: tail call void @llvm.aarch64.sme.smlsl.single.vg2x1.nxv8i16(i32 [[SLICE_BASE:%.*]], <vscale x 8 x i16> [[ZN:%.*]], <vscale x 8 x i16> [[ZM:%.*]])
// CHECK-NEXT: ret void
//
// CPP-CHECK-LABEL: @_Z22test_svmls_single1_s16ju11__SVInt16_tS_(
// CPP-CHECK-NEXT: entry:
-// CPP-CHECK-NEXT: [[ADD:%.*]] = add i32 [[SLICE_BASE:%.*]], 14
-// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.smlsl.single.vg2x1.nxv8i16(i32 [[ADD]], <vscale x 8 x i16> [[ZN:%.*]], <vscale x 8 x i16> [[ZM:%.*]])
+// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.smlsl.single.vg2x1.nxv8i16(i32 [[SLICE_BASE:%.*]], <vscale x 8 x i16> [[ZN:%.*]], <vscale x 8 x i16> [[ZM:%.*]])
// CPP-CHECK-NEXT: ret void
//
void test_svmls_single1_s16(uint32_t slice_base, svint16_t zn, svint16_t zm) __arm_streaming __arm_shared_za
{
- SVE_ACLE_FUNC(svmls_za32,_s16,_vg2x1,,)(slice_base + 14, zn, zm);
+ SVE_ACLE_FUNC(svmls_za32,_s16,_vg2x1,,)(slice_base, zn, zm);
}
// CHECK-LABEL: @test_svmls_single2_f16(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[ADD:%.*]] = add i32 [[SLICE_BASE:%.*]], 6
// CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 8 x half> @llvm.vector.extract.nxv8f16.nxv16f16(<vscale x 16 x half> [[ZN:%.*]], i64 0)
// CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 8 x half> @llvm.vector.extract.nxv8f16.nxv16f16(<vscale x 16 x half> [[ZN]], i64 8)
-// CHECK-NEXT: tail call void @llvm.aarch64.sme.fmlsl.single.vg2x2.nxv8f16(i32 [[ADD]], <vscale x 8 x half> [[TMP0]], <vscale x 8 x half> [[TMP1]], <vscale x 8 x half> [[ZM:%.*]])
+// CHECK-NEXT: tail call void @llvm.aarch64.sme.fmlsl.single.vg2x2.nxv8f16(i32 [[SLICE_BASE:%.*]], <vscale x 8 x half> [[TMP0]], <vscale x 8 x half> [[TMP1]], <vscale x 8 x half> [[ZM:%.*]])
// CHECK-NEXT: ret void
//
// CPP-CHECK-LABEL: @_Z22test_svmls_single2_f16j13svfloat16x2_tu13__SVFloat16_t(
// CPP-CHECK-NEXT: entry:
-// CPP-CHECK-NEXT: [[ADD:%.*]] = add i32 [[SLICE_BASE:%.*]], 6
// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 8 x half> @llvm.vector.extract.nxv8f16.nxv16f16(<vscale x 16 x half> [[ZN:%.*]], i64 0)
// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 8 x half> @llvm.vector.extract.nxv8f16.nxv16f16(<vscale x 16 x half> [[ZN]], i64 8)
-// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.fmlsl.single.vg2x2.nxv8f16(i32 [[ADD]], <vscale x 8 x half> [[TMP0]], <vscale x 8 x half> [[TMP1]], <vscale x 8 x half> [[ZM:%.*]])
+// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.fmlsl.single.vg2x2.nxv8f16(i32 [[SLICE_BASE:%.*]], <vscale x 8 x half> [[TMP0]], <vscale x 8 x half> [[TMP1]], <vscale x 8 x half> [[ZM:%.*]])
// CPP-CHECK-NEXT: ret void
//
void test_svmls_single2_f16(uint32_t slice_base, svfloat16x2_t zn, svfloat16_t zm) __arm_streaming __arm_shared_za
{
- SVE_ACLE_FUNC(svmls,_single,_za32,_f16,_vg2x2)(slice_base + 6, zn, zm);
+ SVE_ACLE_FUNC(svmls,_single,_za32,_f16,_vg2x2)(slice_base, zn, zm);
}
// CHECK-LABEL: @test_svmls_single2_bf16(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[ADD:%.*]] = add i32 [[SLICE_BASE:%.*]], 6
// CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 8 x bfloat> @llvm.vector.extract.nxv8bf16.nxv16bf16(<vscale x 16 x bfloat> [[ZN:%.*]], i64 0)
// CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 8 x bfloat> @llvm.vector.extract.nxv8bf16.nxv16bf16(<vscale x 16 x bfloat> [[ZN]], i64 8)
-// CHECK-NEXT: tail call void @llvm.aarch64.sme.fmlsl.single.vg2x2.nxv8bf16(i32 [[ADD]], <vscale x 8 x bfloat> [[TMP0]], <vscale x 8 x bfloat> [[TMP1]], <vscale x 8 x bfloat> [[ZM:%.*]])
+// CHECK-NEXT: tail call void @llvm.aarch64.sme.fmlsl.single.vg2x2.nxv8bf16(i32 [[SLICE_BASE:%.*]], <vscale x 8 x bfloat> [[TMP0]], <vscale x 8 x bfloat> [[TMP1]], <vscale x 8 x bfloat> [[ZM:%.*]])
// CHECK-NEXT: ret void
//
// CPP-CHECK-LABEL: @_Z23test_svmls_single2_bf16j14svbfloat16x2_tu14__SVBfloat16_t(
// CPP-CHECK-NEXT: entry:
-// CPP-CHECK-NEXT: [[ADD:%.*]] = add i32 [[SLICE_BASE:%.*]], 6
// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 8 x bfloat> @llvm.vector.extract.nxv8bf16.nxv16bf16(<vscale x 16 x bfloat> [[ZN:%.*]], i64 0)
// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 8 x bfloat> @llvm.vector.extract.nxv8bf16.nxv16bf16(<vscale x 16 x bfloat> [[ZN]], i64 8)
-// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.fmlsl.single.vg2x2.nxv8bf16(i32 [[ADD]], <vscale x 8 x bfloat> [[TMP0]], <vscale x 8 x bfloat> [[TMP1]], <vscale x 8 x bfloat> [[ZM:%.*]])
+// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.fmlsl.single.vg2x2.nxv8bf16(i32 [[SLICE_BASE:%.*]], <vscale x 8 x bfloat> [[TMP0]], <vscale x 8 x bfloat> [[TMP1]], <vscale x 8 x bfloat> [[ZM:%.*]])
// CPP-CHECK-NEXT: ret void
//
void test_svmls_single2_bf16(uint32_t slice_base, svbfloat16x2_t zn, svbfloat16_t zm) __arm_streaming __arm_shared_za
{
- SVE_ACLE_FUNC(svmls,_single,_za32,_bf16,_vg2x2)(slice_base + 6, zn, zm);
+ SVE_ACLE_FUNC(svmls,_single,_za32,_bf16,_vg2x2)(slice_base, zn, zm);
}
// CHECK-LABEL: @test_svmls_single2_u16(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[ADD:%.*]] = add i32 [[SLICE_BASE:%.*]], 6
// CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv16i16(<vscale x 16 x i16> [[ZN:%.*]], i64 0)
// CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv16i16(<vscale x 16 x i16> [[ZN]], i64 8)
-// CHECK-NEXT: tail call void @llvm.aarch64.sme.umlsl.single.vg2x2.nxv8i16(i32 [[ADD]], <vscale x 8 x i16> [[TMP0]], <vscale x 8 x i16> [[TMP1]], <vscale x 8 x i16> [[ZM:%.*]])
+// CHECK-NEXT: tail call void @llvm.aarch64.sme.umlsl.single.vg2x2.nxv8i16(i32 [[SLICE_BASE:%.*]], <vscale x 8 x i16> [[TMP0]], <vscale x 8 x i16> [[TMP1]], <vscale x 8 x i16> [[ZM:%.*]])
// CHECK-NEXT: ret void
//
// CPP-CHECK-LABEL: @_Z22test_svmls_single2_u16j12svuint16x2_tu12__SVUint16_t(
// CPP-CHECK-NEXT: entry:
-// CPP-CHECK-NEXT: [[ADD:%.*]] = add i32 [[SLICE_BASE:%.*]], 6
// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv16i16(<vscale x 16 x i16> [[ZN:%.*]], i64 0)
// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv16i16(<vscale x 16 x i16> [[ZN]], i64 8)
-// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.umlsl.single.vg2x2.nxv8i16(i32 [[ADD]], <vscale x 8 x i16> [[TMP0]], <vscale x 8 x i16> [[TMP1]], <vscale x 8 x i16> [[ZM:%.*]])
+// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.umlsl.single.vg2x2.nxv8i16(i32 [[SLICE_BASE:%.*]], <vscale x 8 x i16> [[TMP0]], <vscale x 8 x i16> [[TMP1]], <vscale x 8 x i16> [[ZM:%.*]])
// CPP-CHECK-NEXT: ret void
//
void test_svmls_single2_u16(uint32_t slice_base, svuint16x2_t zn, svuint16_t zm) __arm_streaming __arm_shared_za
{
- SVE_ACLE_FUNC(svmls,_single,_za32,_u16,_vg2x2)(slice_base + 6, zn, zm);
+ SVE_ACLE_FUNC(svmls,_single,_za32,_u16,_vg2x2)(slice_base, zn, zm);
}
// CHECK-LABEL: @test_svmls_single2_s16(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[ADD:%.*]] = add i32 [[SLICE_BASE:%.*]], 6
// CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv16i16(<vscale x 16 x i16> [[ZN:%.*]], i64 0)
// CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv16i16(<vscale x 16 x i16> [[ZN]], i64 8)
-// CHECK-NEXT: tail call void @llvm.aarch64.sme.smlsl.single.vg2x2.nxv8i16(i32 [[ADD]], <vscale x 8 x i16> [[TMP0]], <vscale x 8 x i16> [[TMP1]], <vscale x 8 x i16> [[ZM:%.*]])
+// CHECK-NEXT: tail call void @llvm.aarch64.sme.smlsl.single.vg2x2.nxv8i16(i32 [[SLICE_BASE:%.*]], <vscale x 8 x i16> [[TMP0]], <vscale x 8 x i16> [[TMP1]], <vscale x 8 x i16> [[ZM:%.*]])
// CHECK-NEXT: ret void
//
// CPP-CHECK-LABEL: @_Z22test_svmls_single2_s16j11svint16x2_tu11__SVInt16_t(
// CPP-CHECK-NEXT: entry:
-// CPP-CHECK-NEXT: [[ADD:%.*]] = add i32 [[SLICE_BASE:%.*]], 6
// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv16i16(<vscale x 16 x i16> [[ZN:%.*]], i64 0)
// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv16i16(<vscale x 16 x i16> [[ZN]], i64 8)
-// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.smlsl.single.vg2x2.nxv8i16(i32 [[ADD]], <vscale x 8 x i16> [[TMP0]], <vscale x 8 x i16> [[TMP1]], <vscale x 8 x i16> [[ZM:%.*]])
+// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.smlsl.single.vg2x2.nxv8i16(i32 [[SLICE_BASE:%.*]], <vscale x 8 x i16> [[TMP0]], <vscale x 8 x i16> [[TMP1]], <vscale x 8 x i16> [[ZM:%.*]])
// CPP-CHECK-NEXT: ret void
//
void test_svmls_single2_s16(uint32_t slice_base, svint16x2_t zn, svint16_t zm) __arm_streaming __arm_shared_za
{
- SVE_ACLE_FUNC(svmls,_single,_za32,_s16,_vg2x2)(slice_base + 6, zn, zm);
+ SVE_ACLE_FUNC(svmls,_single,_za32,_s16,_vg2x2)(slice_base, zn, zm);
}
// CHECK-LABEL: @test_svmls_single4_f16(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[ADD:%.*]] = add i32 [[SLICE_BASE:%.*]], 6
// CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 8 x half> @llvm.vector.extract.nxv8f16.nxv32f16(<vscale x 32 x half> [[ZN:%.*]], i64 0)
// CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 8 x half> @llvm.vector.extract.nxv8f16.nxv32f16(<vscale x 32 x half> [[ZN]], i64 8)
// CHECK-NEXT: [[TMP2:%.*]] = tail call <vscale x 8 x half> @llvm.vector.extract.nxv8f16.nxv32f16(<vscale x 32 x half> [[ZN]], i64 16)
// CHECK-NEXT: [[TMP3:%.*]] = tail call <vscale x 8 x half> @llvm.vector.extract.nxv8f16.nxv32f16(<vscale x 32 x half> [[ZN]], i64 24)
-// CHECK-NEXT: tail call void @llvm.aarch64.sme.fmlsl.single.vg2x4.nxv8f16(i32 [[ADD]], <vscale x 8 x half> [[TMP0]], <vscale x 8 x half> [[TMP1]], <vscale x 8 x half> [[TMP2]], <vscale x 8 x half> [[TMP3]], <vscale x 8 x half> [[ZM:%.*]])
+// CHECK-NEXT: tail call void @llvm.aarch64.sme.fmlsl.single.vg2x4.nxv8f16(i32 [[SLICE_BASE:%.*]], <vscale x 8 x half> [[TMP0]], <vscale x 8 x half> [[TMP1]], <vscale x 8 x half> [[TMP2]], <vscale x 8 x half> [[TMP3]], <vscale x 8 x half> [[ZM:%.*]])
// CHECK-NEXT: ret void
//
// CPP-CHECK-LABEL: @_Z22test_svmls_single4_f16j13svfloat16x4_tu13__SVFloat16_t(
// CPP-CHECK-NEXT: entry:
-// CPP-CHECK-NEXT: [[ADD:%.*]] = add i32 [[SLICE_BASE:%.*]], 6
// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 8 x half> @llvm.vector.extract.nxv8f16.nxv32f16(<vscale x 32 x half> [[ZN:%.*]], i64 0)
// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 8 x half> @llvm.vector.extract.nxv8f16.nxv32f16(<vscale x 32 x half> [[ZN]], i64 8)
// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call <vscale x 8 x half> @llvm.vector.extract.nxv8f16.nxv32f16(<vscale x 32 x half> [[ZN]], i64 16)
// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call <vscale x 8 x half> @llvm.vector.extract.nxv8f16.nxv32f16(<vscale x 32 x half> [[ZN]], i64 24)
-// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.fmlsl.single.vg2x4.nxv8f16(i32 [[ADD]], <vscale x 8 x half> [[TMP0]], <vscale x 8 x half> [[TMP1]], <vscale x 8 x half> [[TMP2]], <vscale x 8 x half> [[TMP3]], <vscale x 8 x half> [[ZM:%.*]])
+// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.fmlsl.single.vg2x4.nxv8f16(i32 [[SLICE_BASE:%.*]], <vscale x 8 x half> [[TMP0]], <vscale x 8 x half> [[TMP1]], <vscale x 8 x half> [[TMP2]], <vscale x 8 x half> [[TMP3]], <vscale x 8 x half> [[ZM:%.*]])
// CPP-CHECK-NEXT: ret void
//
void test_svmls_single4_f16(uint32_t slice_base, svfloat16x4_t zn, svfloat16_t zm) __arm_streaming __arm_shared_za
{
- SVE_ACLE_FUNC(svmls,_single,_za32,_f16,_vg2x4)(slice_base + 6, zn, zm);
+ SVE_ACLE_FUNC(svmls,_single,_za32,_f16,_vg2x4)(slice_base, zn, zm);
}
// CHECK-LABEL: @test_svmls_single4_bf16(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[ADD:%.*]] = add i32 [[SLICE_BASE:%.*]], 6
// CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 8 x bfloat> @llvm.vector.extract.nxv8bf16.nxv32bf16(<vscale x 32 x bfloat> [[ZN:%.*]], i64 0)
// CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 8 x bfloat> @llvm.vector.extract.nxv8bf16.nxv32bf16(<vscale x 32 x bfloat> [[ZN]], i64 8)
// CHECK-NEXT: [[TMP2:%.*]] = tail call <vscale x 8 x bfloat> @llvm.vector.extract.nxv8bf16.nxv32bf16(<vscale x 32 x bfloat> [[ZN]], i64 16)
// CHECK-NEXT: [[TMP3:%.*]] = tail call <vscale x 8 x bfloat> @llvm.vector.extract.nxv8bf16.nxv32bf16(<vscale x 32 x bfloat> [[ZN]], i64 24)
-// CHECK-NEXT: tail call void @llvm.aarch64.sme.fmlsl.single.vg2x4.nxv8bf16(i32 [[ADD]], <vscale x 8 x bfloat> [[TMP0]], <vscale x 8 x bfloat> [[TMP1]], <vscale x 8 x bfloat> [[TMP2]], <vscale x 8 x bfloat> [[TMP3]], <vscale x 8 x bfloat> [[ZM:%.*]])
+// CHECK-NEXT: tail call void @llvm.aarch64.sme.fmlsl.single.vg2x4.nxv8bf16(i32 [[SLICE_BASE:%.*]], <vscale x 8 x bfloat> [[TMP0]], <vscale x 8 x bfloat> [[TMP1]], <vscale x 8 x bfloat> [[TMP2]], <vscale x 8 x bfloat> [[TMP3]], <vscale x 8 x bfloat> [[ZM:%.*]])
// CHECK-NEXT: ret void
//
// CPP-CHECK-LABEL: @_Z23test_svmls_single4_bf16j14svbfloat16x4_tu14__SVBfloat16_t(
// CPP-CHECK-NEXT: entry:
-// CPP-CHECK-NEXT: [[ADD:%.*]] = add i32 [[SLICE_BASE:%.*]], 6
// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 8 x bfloat> @llvm.vector.extract.nxv8bf16.nxv32bf16(<vscale x 32 x bfloat> [[ZN:%.*]], i64 0)
// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 8 x bfloat> @llvm.vector.extract.nxv8bf16.nxv32bf16(<vscale x 32 x bfloat> [[ZN]], i64 8)
// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call <vscale x 8 x bfloat> @llvm.vector.extract.nxv8bf16.nxv32bf16(<vscale x 32 x bfloat> [[ZN]], i64 16)
// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call <vscale x 8 x bfloat> @llvm.vector.extract.nxv8bf16.nxv32bf16(<vscale x 32 x bfloat> [[ZN]], i64 24)
-// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.fmlsl.single.vg2x4.nxv8bf16(i32 [[ADD]], <vscale x 8 x bfloat> [[TMP0]], <vscale x 8 x bfloat> [[TMP1]], <vscale x 8 x bfloat> [[TMP2]], <vscale x 8 x bfloat> [[TMP3]], <vscale x 8 x bfloat> [[ZM:%.*]])
+// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.fmlsl.single.vg2x4.nxv8bf16(i32 [[SLICE_BASE:%.*]], <vscale x 8 x bfloat> [[TMP0]], <vscale x 8 x bfloat> [[TMP1]], <vscale x 8 x bfloat> [[TMP2]], <vscale x 8 x bfloat> [[TMP3]], <vscale x 8 x bfloat> [[ZM:%.*]])
// CPP-CHECK-NEXT: ret void
//
void test_svmls_single4_bf16(uint32_t slice_base, svbfloat16x4_t zn, svbfloat16_t zm) __arm_streaming __arm_shared_za
{
- SVE_ACLE_FUNC(svmls,_single,_za32,_bf16,_vg2x4)(slice_base + 6, zn, zm);
+ SVE_ACLE_FUNC(svmls,_single,_za32,_bf16,_vg2x4)(slice_base, zn, zm);
}
// CHECK-LABEL: @test_svmls_single4_u16(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[ADD:%.*]] = add i32 [[SLICE_BASE:%.*]], 6
// CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv32i16(<vscale x 32 x i16> [[ZN:%.*]], i64 0)
// CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv32i16(<vscale x 32 x i16> [[ZN]], i64 8)
// CHECK-NEXT: [[TMP2:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv32i16(<vscale x 32 x i16> [[ZN]], i64 16)
// CHECK-NEXT: [[TMP3:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv32i16(<vscale x 32 x i16> [[ZN]], i64 24)
-// CHECK-NEXT: tail call void @llvm.aarch64.sme.umlsl.single.vg2x4.nxv8i16(i32 [[ADD]], <vscale x 8 x i16> [[TMP0]], <vscale x 8 x i16> [[TMP1]], <vscale x 8 x i16> [[TMP2]], <vscale x 8 x i16> [[TMP3]], <vscale x 8 x i16> [[ZM:%.*]])
+// CHECK-NEXT: tail call void @llvm.aarch64.sme.umlsl.single.vg2x4.nxv8i16(i32 [[SLICE_BASE:%.*]], <vscale x 8 x i16> [[TMP0]], <vscale x 8 x i16> [[TMP1]], <vscale x 8 x i16> [[TMP2]], <vscale x 8 x i16> [[TMP3]], <vscale x 8 x i16> [[ZM:%.*]])
// CHECK-NEXT: ret void
//
// CPP-CHECK-LABEL: @_Z22test_svmls_single4_u16j12svuint16x4_tu12__SVUint16_t(
// CPP-CHECK-NEXT: entry:
-// CPP-CHECK-NEXT: [[ADD:%.*]] = add i32 [[SLICE_BASE:%.*]], 6
// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv32i16(<vscale x 32 x i16> [[ZN:%.*]], i64 0)
// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv32i16(<vscale x 32 x i16> [[ZN]], i64 8)
// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv32i16(<vscale x 32 x i16> [[ZN]], i64 16)
// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv32i16(<vscale x 32 x i16> [[ZN]], i64 24)
-// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.umlsl.single.vg2x4.nxv8i16(i32 [[ADD]], <vscale x 8 x i16> [[TMP0]], <vscale x 8 x i16> [[TMP1]], <vscale x 8 x i16> [[TMP2]], <vscale x 8 x i16> [[TMP3]], <vscale x 8 x i16> [[ZM:%.*]])
+// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.umlsl.single.vg2x4.nxv8i16(i32 [[SLICE_BASE:%.*]], <vscale x 8 x i16> [[TMP0]], <vscale x 8 x i16> [[TMP1]], <vscale x 8 x i16> [[TMP2]], <vscale x 8 x i16> [[TMP3]], <vscale x 8 x i16> [[ZM:%.*]])
// CPP-CHECK-NEXT: ret void
//
void test_svmls_single4_u16(uint32_t slice_base, svuint16x4_t zn, svuint16_t zm) __arm_streaming __arm_shared_za
{
- SVE_ACLE_FUNC(svmls,_single,_za32,_u16,_vg2x4)(slice_base + 6, zn, zm);
+ SVE_ACLE_FUNC(svmls,_single,_za32,_u16,_vg2x4)(slice_base, zn, zm);
}
// CHECK-LABEL: @test_svmls_single4_s16(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[ADD:%.*]] = add i32 [[SLICE_BASE:%.*]], 6
// CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv32i16(<vscale x 32 x i16> [[ZN:%.*]], i64 0)
// CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv32i16(<vscale x 32 x i16> [[ZN]], i64 8)
// CHECK-NEXT: [[TMP2:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv32i16(<vscale x 32 x i16> [[ZN]], i64 16)
// CHECK-NEXT: [[TMP3:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv32i16(<vscale x 32 x i16> [[ZN]], i64 24)
-// CHECK-NEXT: tail call void @llvm.aarch64.sme.smlsl.single.vg2x4.nxv8i16(i32 [[ADD]], <vscale x 8 x i16> [[TMP0]], <vscale x 8 x i16> [[TMP1]], <vscale x 8 x i16> [[TMP2]], <vscale x 8 x i16> [[TMP3]], <vscale x 8 x i16> [[ZM:%.*]])
+// CHECK-NEXT: tail call void @llvm.aarch64.sme.smlsl.single.vg2x4.nxv8i16(i32 [[SLICE_BASE:%.*]], <vscale x 8 x i16> [[TMP0]], <vscale x 8 x i16> [[TMP1]], <vscale x 8 x i16> [[TMP2]], <vscale x 8 x i16> [[TMP3]], <vscale x 8 x i16> [[ZM:%.*]])
// CHECK-NEXT: ret void
//
// CPP-CHECK-LABEL: @_Z22test_svmls_single4_s16j11svint16x4_tu11__SVInt16_t(
// CPP-CHECK-NEXT: entry:
-// CPP-CHECK-NEXT: [[ADD:%.*]] = add i32 [[SLICE_BASE:%.*]], 6
// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv32i16(<vscale x 32 x i16> [[ZN:%.*]], i64 0)
// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv32i16(<vscale x 32 x i16> [[ZN]], i64 8)
// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv32i16(<vscale x 32 x i16> [[ZN]], i64 16)
// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv32i16(<vscale x 32 x i16> [[ZN]], i64 24)
-// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.smlsl.single.vg2x4.nxv8i16(i32 [[ADD]], <vscale x 8 x i16> [[TMP0]], <vscale x 8 x i16> [[TMP1]], <vscale x 8 x i16> [[TMP2]], <vscale x 8 x i16> [[TMP3]], <vscale x 8 x i16> [[ZM:%.*]])
+// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.smlsl.single.vg2x4.nxv8i16(i32 [[SLICE_BASE:%.*]], <vscale x 8 x i16> [[TMP0]], <vscale x 8 x i16> [[TMP1]], <vscale x 8 x i16> [[TMP2]], <vscale x 8 x i16> [[TMP3]], <vscale x 8 x i16> [[ZM:%.*]])
// CPP-CHECK-NEXT: ret void
//
void test_svmls_single4_s16(uint32_t slice_base, svint16x4_t zn, svint16_t zm) __arm_streaming __arm_shared_za
{
- SVE_ACLE_FUNC(svmls,_single,_za32,_s16,_vg2x4)(slice_base + 6, zn, zm);
+ SVE_ACLE_FUNC(svmls,_single,_za32,_s16,_vg2x4)(slice_base, zn, zm);
}
//
@@ -509,252 +469,228 @@ void test_svmls_single4_s16(uint32_t slice_base, svint16x4_t zn, svint16_t zm) _
// CHECK-LABEL: @test_svmls_lane1_f16(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[ADD:%.*]] = add i32 [[SLICE_BASE:%.*]], 14
-// CHECK-NEXT: tail call void @llvm.aarch64.sme.fmlsl.lane.vg2x1.nxv8f16(i32 [[ADD]], <vscale x 8 x half> [[ZN:%.*]], <vscale x 8 x half> [[ZM:%.*]], i32 7)
+// CHECK-NEXT: tail call void @llvm.aarch64.sme.fmlsl.lane.vg2x1.nxv8f16(i32 [[SLICE_BASE:%.*]], <vscale x 8 x half> [[ZN:%.*]], <vscale x 8 x half> [[ZM:%.*]], i32 7)
// CHECK-NEXT: ret void
//
// CPP-CHECK-LABEL: @_Z20test_svmls_lane1_f16ju13__SVFloat16_tS_(
// CPP-CHECK-NEXT: entry:
-// CPP-CHECK-NEXT: [[ADD:%.*]] = add i32 [[SLICE_BASE:%.*]], 14
-// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.fmlsl.lane.vg2x1.nxv8f16(i32 [[ADD]], <vscale x 8 x half> [[ZN:%.*]], <vscale x 8 x half> [[ZM:%.*]], i32 7)
+// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.fmlsl.lane.vg2x1.nxv8f16(i32 [[SLICE_BASE:%.*]], <vscale x 8 x half> [[ZN:%.*]], <vscale x 8 x half> [[ZM:%.*]], i32 7)
// CPP-CHECK-NEXT: ret void
//
void test_svmls_lane1_f16(uint32_t slice_base, svfloat16_t zn, svfloat16_t zm) __arm_streaming __arm_shared_za
{
- SVE_ACLE_FUNC(svmls_lane,,_za32,_f16,_vg2x1)(slice_base + 14, zn, zm, 7);
+ SVE_ACLE_FUNC(svmls_lane,,_za32,_f16,_vg2x1)(slice_base, zn, zm, 7);
}
// CHECK-LABEL: @test_svmls_lane1_bf16(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[ADD:%.*]] = add i32 [[SLICE_BASE:%.*]], 14
-// CHECK-NEXT: tail call void @llvm.aarch64.sme.fmlsl.lane.vg2x1.nxv8bf16(i32 [[ADD]], <vscale x 8 x bfloat> [[ZN:%.*]], <vscale x 8 x bfloat> [[ZM:%.*]], i32 7)
+// CHECK-NEXT: tail call void @llvm.aarch64.sme.fmlsl.lane.vg2x1.nxv8bf16(i32 [[SLICE_BASE:%.*]], <vscale x 8 x bfloat> [[ZN:%.*]], <vscale x 8 x bfloat> [[ZM:%.*]], i32 7)
// CHECK-NEXT: ret void
//
// CPP-CHECK-LABEL: @_Z21test_svmls_lane1_bf16ju14__SVBfloat16_tS_(
// CPP-CHECK-NEXT: entry:
-// CPP-CHECK-NEXT: [[ADD:%.*]] = add i32 [[SLICE_BASE:%.*]], 14
-// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.fmlsl.lane.vg2x1.nxv8bf16(i32 [[ADD]], <vscale x 8 x bfloat> [[ZN:%.*]], <vscale x 8 x bfloat> [[ZM:%.*]], i32 7)
+// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.fmlsl.lane.vg2x1.nxv8bf16(i32 [[SLICE_BASE:%.*]], <vscale x 8 x bfloat> [[ZN:%.*]], <vscale x 8 x bfloat> [[ZM:%.*]], i32 7)
// CPP-CHECK-NEXT: ret void
//
void test_svmls_lane1_bf16(uint32_t slice_base, svbfloat16_t zn, svbfloat16_t zm) __arm_streaming __arm_shared_za
{
- SVE_ACLE_FUNC(svmls_lane,,_za32,_bf16,_vg2x1)(slice_base + 14, zn, zm, 7);
+ SVE_ACLE_FUNC(svmls_lane,,_za32,_bf16,_vg2x1)(slice_base, zn, zm, 7);
}
// CHECK-LABEL: @test_svmls_lane1_u16(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[ADD:%.*]] = add i32 [[SLICE_BASE:%.*]], 14
-// CHECK-NEXT: tail call void @llvm.aarch64.sme.umlsl.lane.vg2x1.nxv8i16(i32 [[ADD]], <vscale x 8 x i16> [[ZN:%.*]], <vscale x 8 x i16> [[ZM:%.*]], i32 7)
+// CHECK-NEXT: tail call void @llvm.aarch64.sme.umlsl.lane.vg2x1.nxv8i16(i32 [[SLICE_BASE:%.*]], <vscale x 8 x i16> [[ZN:%.*]], <vscale x 8 x i16> [[ZM:%.*]], i32 7)
// CHECK-NEXT: ret void
//
// CPP-CHECK-LABEL: @_Z20test_svmls_lane1_u16ju12__SVUint16_tS_(
// CPP-CHECK-NEXT: entry:
-// CPP-CHECK-NEXT: [[ADD:%.*]] = add i32 [[SLICE_BASE:%.*]], 14
-// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.umlsl.lane.vg2x1.nxv8i16(i32 [[ADD]], <vscale x 8 x i16> [[ZN:%.*]], <vscale x 8 x i16> [[ZM:%.*]], i32 7)
+// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.umlsl.lane.vg2x1.nxv8i16(i32 [[SLICE_BASE:%.*]], <vscale x 8 x i16> [[ZN:%.*]], <vscale x 8 x i16> [[ZM:%.*]], i32 7)
// CPP-CHECK-NEXT: ret void
//
void test_svmls_lane1_u16(uint32_t slice_base, svuint16_t zn, svuint16_t zm) __arm_streaming __arm_shared_za
{
- SVE_ACLE_FUNC(svmls_lane,,_za32,_u16,_vg2x1)(slice_base + 14, zn, zm, 7);
+ SVE_ACLE_FUNC(svmls_lane,,_za32,_u16,_vg2x1)(slice_base, zn, zm, 7);
}
// CHECK-LABEL: @test_svmls_lane1_s16(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[ADD:%.*]] = add i32 [[SLICE_BASE:%.*]], 14
-// CHECK-NEXT: tail call void @llvm.aarch64.sme.smlsl.lane.vg2x1.nxv8i16(i32 [[ADD]], <vscale x 8 x i16> [[ZN:%.*]], <vscale x 8 x i16> [[ZM:%.*]], i32 7)
+// CHECK-NEXT: tail call void @llvm.aarch64.sme.smlsl.lane.vg2x1.nxv8i16(i32 [[SLICE_BASE:%.*]], <vscale x 8 x i16> [[ZN:%.*]], <vscale x 8 x i16> [[ZM:%.*]], i32 7)
// CHECK-NEXT: ret void
//
// CPP-CHECK-LABEL: @_Z20test_svmls_lane1_s16ju11__SVInt16_tS_(
// CPP-CHECK-NEXT: entry:
-// CPP-CHECK-NEXT: [[ADD:%.*]] = add i32 [[SLICE_BASE:%.*]], 14
-// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.smlsl.lane.vg2x1.nxv8i16(i32 [[ADD]], <vscale x 8 x i16> [[ZN:%.*]], <vscale x 8 x i16> [[ZM:%.*]], i32 7)
+// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.smlsl.lane.vg2x1.nxv8i16(i32 [[SLICE_BASE:%.*]], <vscale x 8 x i16> [[ZN:%.*]], <vscale x 8 x i16> [[ZM:%.*]], i32 7)
// CPP-CHECK-NEXT: ret void
//
void test_svmls_lane1_s16(uint32_t slice_base, svint16_t zn, svint16_t zm) __arm_streaming __arm_shared_za
{
- SVE_ACLE_FUNC(svmls_lane,,_za32,_s16,_vg2x1)(slice_base + 14, zn, zm, 7);
+ SVE_ACLE_FUNC(svmls_lane,,_za32,_s16,_vg2x1)(slice_base, zn, zm, 7);
}
// CHECK-LABEL: @test_svmls_lane2_f16(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[ADD:%.*]] = add i32 [[SLICE_BASE:%.*]], 6
// CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 8 x half> @llvm.vector.extract.nxv8f16.nxv16f16(<vscale x 16 x half> [[ZN:%.*]], i64 0)
// CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 8 x half> @llvm.vector.extract.nxv8f16.nxv16f16(<vscale x 16 x half> [[ZN]], i64 8)
-// CHECK-NEXT: tail call void @llvm.aarch64.sme.fmlsl.lane.vg2x2.nxv8f16(i32 [[ADD]], <vscale x 8 x half> [[TMP0]], <vscale x 8 x half> [[TMP1]], <vscale x 8 x half> [[ZM:%.*]], i32 7)
+// CHECK-NEXT: tail call void @llvm.aarch64.sme.fmlsl.lane.vg2x2.nxv8f16(i32 [[SLICE_BASE:%.*]], <vscale x 8 x half> [[TMP0]], <vscale x 8 x half> [[TMP1]], <vscale x 8 x half> [[ZM:%.*]], i32 7)
// CHECK-NEXT: ret void
//
// CPP-CHECK-LABEL: @_Z20test_svmls_lane2_f16j13svfloat16x2_tu13__SVFloat16_t(
// CPP-CHECK-NEXT: entry:
-// CPP-CHECK-NEXT: [[ADD:%.*]] = add i32 [[SLICE_BASE:%.*]], 6
// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 8 x half> @llvm.vector.extract.nxv8f16.nxv16f16(<vscale x 16 x half> [[ZN:%.*]], i64 0)
// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 8 x half> @llvm.vector.extract.nxv8f16.nxv16f16(<vscale x 16 x half> [[ZN]], i64 8)
-// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.fmlsl.lane.vg2x2.nxv8f16(i32 [[ADD]], <vscale x 8 x half> [[TMP0]], <vscale x 8 x half> [[TMP1]], <vscale x 8 x half> [[ZM:%.*]], i32 7)
+// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.fmlsl.lane.vg2x2.nxv8f16(i32 [[SLICE_BASE:%.*]], <vscale x 8 x half> [[TMP0]], <vscale x 8 x half> [[TMP1]], <vscale x 8 x half> [[ZM:%.*]], i32 7)
// CPP-CHECK-NEXT: ret void
//
void test_svmls_lane2_f16(uint32_t slice_base, svfloat16x2_t zn, svfloat16_t zm) __arm_streaming __arm_shared_za
{
- SVE_ACLE_FUNC(svmls_lane,,_za32,_f16,_vg2x2)(slice_base + 6, zn, zm, 7);
+ SVE_ACLE_FUNC(svmls_lane,,_za32,_f16,_vg2x2)(slice_base, zn, zm, 7);
}
// CHECK-LABEL: @test_svmls_lane2_bf16(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[ADD:%.*]] = add i32 [[SLICE_BASE:%.*]], 6
// CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 8 x bfloat> @llvm.vector.extract.nxv8bf16.nxv16bf16(<vscale x 16 x bfloat> [[ZN:%.*]], i64 0)
// CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 8 x bfloat> @llvm.vector.extract.nxv8bf16.nxv16bf16(<vscale x 16 x bfloat> [[ZN]], i64 8)
-// CHECK-NEXT: tail call void @llvm.aarch64.sme.fmlsl.lane.vg2x2.nxv8bf16(i32 [[ADD]], <vscale x 8 x bfloat> [[TMP0]], <vscale x 8 x bfloat> [[TMP1]], <vscale x 8 x bfloat> [[ZM:%.*]], i32 7)
+// CHECK-NEXT: tail call void @llvm.aarch64.sme.fmlsl.lane.vg2x2.nxv8bf16(i32 [[SLICE_BASE:%.*]], <vscale x 8 x bfloat> [[TMP0]], <vscale x 8 x bfloat> [[TMP1]], <vscale x 8 x bfloat> [[ZM:%.*]], i32 7)
// CHECK-NEXT: ret void
//
// CPP-CHECK-LABEL: @_Z21test_svmls_lane2_bf16j14svbfloat16x2_tu14__SVBfloat16_t(
// CPP-CHECK-NEXT: entry:
-// CPP-CHECK-NEXT: [[ADD:%.*]] = add i32 [[SLICE_BASE:%.*]], 6
// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 8 x bfloat> @llvm.vector.extract.nxv8bf16.nxv16bf16(<vscale x 16 x bfloat> [[ZN:%.*]], i64 0)
// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 8 x bfloat> @llvm.vector.extract.nxv8bf16.nxv16bf16(<vscale x 16 x bfloat> [[ZN]], i64 8)
-// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.fmlsl.lane.vg2x2.nxv8bf16(i32 [[ADD]], <vscale x 8 x bfloat> [[TMP0]], <vscale x 8 x bfloat> [[TMP1]], <vscale x 8 x bfloat> [[ZM:%.*]], i32 7)
+// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.fmlsl.lane.vg2x2.nxv8bf16(i32 [[SLICE_BASE:%.*]], <vscale x 8 x bfloat> [[TMP0]], <vscale x 8 x bfloat> [[TMP1]], <vscale x 8 x bfloat> [[ZM:%.*]], i32 7)
// CPP-CHECK-NEXT: ret void
//
void test_svmls_lane2_bf16(uint32_t slice_base, svbfloat16x2_t zn, svbfloat16_t zm) __arm_streaming __arm_shared_za
{
- SVE_ACLE_FUNC(svmls_lane,,_za32,_bf16,_vg2x2)(slice_base + 6, zn, zm, 7);
+ SVE_ACLE_FUNC(svmls_lane,,_za32,_bf16,_vg2x2)(slice_base, zn, zm, 7);
}
// CHECK-LABEL: @test_svmls_lane2_u16(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[ADD:%.*]] = add i32 [[SLICE_BASE:%.*]], 6
// CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv16i16(<vscale x 16 x i16> [[ZN:%.*]], i64 0)
// CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv16i16(<vscale x 16 x i16> [[ZN]], i64 8)
-// CHECK-NEXT: tail call void @llvm.aarch64.sme.umlsl.lane.vg2x2.nxv8i16(i32 [[ADD]], <vscale x 8 x i16> [[TMP0]], <vscale x 8 x i16> [[TMP1]], <vscale x 8 x i16> [[ZM:%.*]], i32 7)
+// CHECK-NEXT: tail call void @llvm.aarch64.sme.umlsl.lane.vg2x2.nxv8i16(i32 [[SLICE_BASE:%.*]], <vscale x 8 x i16> [[TMP0]], <vscale x 8 x i16> [[TMP1]], <vscale x 8 x i16> [[ZM:%.*]], i32 7)
// CHECK-NEXT: ret void
//
// CPP-CHECK-LABEL: @_Z20test_svmls_lane2_u16j12svuint16x2_tu12__SVUint16_t(
// CPP-CHECK-NEXT: entry:
-// CPP-CHECK-NEXT: [[ADD:%.*]] = add i32 [[SLICE_BASE:%.*]], 6
// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv16i16(<vscale x 16 x i16> [[ZN:%.*]], i64 0)
// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv16i16(<vscale x 16 x i16> [[ZN]], i64 8)
-// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.umlsl.lane.vg2x2.nxv8i16(i32 [[ADD]], <vscale x 8 x i16> [[TMP0]], <vscale x 8 x i16> [[TMP1]], <vscale x 8 x i16> [[ZM:%.*]], i32 7)
+// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.umlsl.lane.vg2x2.nxv8i16(i32 [[SLICE_BASE:%.*]], <vscale x 8 x i16> [[TMP0]], <vscale x 8 x i16> [[TMP1]], <vscale x 8 x i16> [[ZM:%.*]], i32 7)
// CPP-CHECK-NEXT: ret void
//
void test_svmls_lane2_u16(uint32_t slice_base, svuint16x2_t zn, svuint16_t zm) __arm_streaming __arm_shared_za
{
- SVE_ACLE_FUNC(svmls_lane,,_za32,_u16,_vg2x2)(slice_base + 6, zn, zm, 7);
+ SVE_ACLE_FUNC(svmls_lane,,_za32,_u16,_vg2x2)(slice_base, zn, zm, 7);
}
// CHECK-LABEL: @test_svmls_lane2_s16(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[ADD:%.*]] = add i32 [[SLICE_BASE:%.*]], 6
// CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv16i16(<vscale x 16 x i16> [[ZN:%.*]], i64 0)
// CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv16i16(<vscale x 16 x i16> [[ZN]], i64 8)
-// CHECK-NEXT: tail call void @llvm.aarch64.sme.smlsl.lane.vg2x2.nxv8i16(i32 [[ADD]], <vscale x 8 x i16> [[TMP0]], <vscale x 8 x i16> [[TMP1]], <vscale x 8 x i16> [[ZM:%.*]], i32 7)
+// CHECK-NEXT: tail call void @llvm.aarch64.sme.smlsl.lane.vg2x2.nxv8i16(i32 [[SLICE_BASE:%.*]], <vscale x 8 x i16> [[TMP0]], <vscale x 8 x i16> [[TMP1]], <vscale x 8 x i16> [[ZM:%.*]], i32 7)
// CHECK-NEXT: ret void
//
// CPP-CHECK-LABEL: @_Z20test_svmls_lane2_s16j11svint16x2_tu11__SVInt16_t(
// CPP-CHECK-NEXT: entry:
-// CPP-CHECK-NEXT: [[ADD:%.*]] = add i32 [[SLICE_BASE:%.*]], 6
// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv16i16(<vscale x 16 x i16> [[ZN:%.*]], i64 0)
// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv16i16(<vscale x 16 x i16> [[ZN]], i64 8)
-// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.smlsl.lane.vg2x2.nxv8i16(i32 [[ADD]], <vscale x 8 x i16> [[TMP0]], <vscale x 8 x i16> [[TMP1]], <vscale x 8 x i16> [[ZM:%.*]], i32 7)
+// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.smlsl.lane.vg2x2.nxv8i16(i32 [[SLICE_BASE:%.*]], <vscale x 8 x i16> [[TMP0]], <vscale x 8 x i16> [[TMP1]], <vscale x 8 x i16> [[ZM:%.*]], i32 7)
// CPP-CHECK-NEXT: ret void
//
void test_svmls_lane2_s16(uint32_t slice_base, svint16x2_t zn, svint16_t zm) __arm_streaming __arm_shared_za
{
- SVE_ACLE_FUNC(svmls_lane,,_za32,_s16,_vg2x2)(slice_base + 6, zn, zm, 7);
+ SVE_ACLE_FUNC(svmls_lane,,_za32,_s16,_vg2x2)(slice_base, zn, zm, 7);
}
// CHECK-LABEL: @test_svmls_lane4_f16(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[ADD:%.*]] = add i32 [[SLICE_BASE:%.*]], 6
// CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 8 x half> @llvm.vector.extract.nxv8f16.nxv32f16(<vscale x 32 x half> [[ZN:%.*]], i64 0)
// CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 8 x half> @llvm.vector.extract.nxv8f16.nxv32f16(<vscale x 32 x half> [[ZN]], i64 8)
// CHECK-NEXT: [[TMP2:%.*]] = tail call <vscale x 8 x half> @llvm.vector.extract.nxv8f16.nxv32f16(<vscale x 32 x half> [[ZN]], i64 16)
// CHECK-NEXT: [[TMP3:%.*]] = tail call <vscale x 8 x half> @llvm.vector.extract.nxv8f16.nxv32f16(<vscale x 32 x half> [[ZN]], i64 24)
-// CHECK-NEXT: tail call void @llvm.aarch64.sme.fmlsl.lane.vg2x4.nxv8f16(i32 [[ADD]], <vscale x 8 x half> [[TMP0]], <vscale x 8 x half> [[TMP1]], <vscale x 8 x half> [[TMP2]], <vscale x 8 x half> [[TMP3]], <vscale x 8 x half> [[ZM:%.*]], i32 7)
+// CHECK-NEXT: tail call void @llvm.aarch64.sme.fmlsl.lane.vg2x4.nxv8f16(i32 [[SLICE_BASE:%.*]], <vscale x 8 x half> [[TMP0]], <vscale x 8 x half> [[TMP1]], <vscale x 8 x half> [[TMP2]], <vscale x 8 x half> [[TMP3]], <vscale x 8 x half> [[ZM:%.*]], i32 7)
// CHECK-NEXT: ret void
//
// CPP-CHECK-LABEL: @_Z20test_svmls_lane4_f16j13svfloat16x4_tu13__SVFloat16_t(
// CPP-CHECK-NEXT: entry:
-// CPP-CHECK-NEXT: [[ADD:%.*]] = add i32 [[SLICE_BASE:%.*]], 6
// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 8 x half> @llvm.vector.extract.nxv8f16.nxv32f16(<vscale x 32 x half> [[ZN:%.*]], i64 0)
// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 8 x half> @llvm.vector.extract.nxv8f16.nxv32f16(<vscale x 32 x half> [[ZN]], i64 8)
// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call <vscale x 8 x half> @llvm.vector.extract.nxv8f16.nxv32f16(<vscale x 32 x half> [[ZN]], i64 16)
// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call <vscale x 8 x half> @llvm.vector.extract.nxv8f16.nxv32f16(<vscale x 32 x half> [[ZN]], i64 24)
-// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.fmlsl.lane.vg2x4.nxv8f16(i32 [[ADD]], <vscale x 8 x half> [[TMP0]], <vscale x 8 x half> [[TMP1]], <vscale x 8 x half> [[TMP2]], <vscale x 8 x half> [[TMP3]], <vscale x 8 x half> [[ZM:%.*]], i32 7)
+// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.fmlsl.lane.vg2x4.nxv8f16(i32 [[SLICE_BASE:%.*]], <vscale x 8 x half> [[TMP0]], <vscale x 8 x half> [[TMP1]], <vscale x 8 x half> [[TMP2]], <vscale x 8 x half> [[TMP3]], <vscale x 8 x half> [[ZM:%.*]], i32 7)
// CPP-CHECK-NEXT: ret void
//
void test_svmls_lane4_f16(uint32_t slice_base, svfloat16x4_t zn, svfloat16_t zm) __arm_streaming __arm_shared_za
{
- SVE_ACLE_FUNC(svmls_lane,,_za32,_f16,_vg2x4)(slice_base + 6, zn, zm, 7);
+ SVE_ACLE_FUNC(svmls_lane,,_za32,_f16,_vg2x4)(slice_base, zn, zm, 7);
}
// CHECK-LABEL: @test_svmls_lane4_bf16(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[ADD:%.*]] = add i32 [[SLICE_BASE:%.*]], 6
// CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 8 x bfloat> @llvm.vector.extract.nxv8bf16.nxv32bf16(<vscale x 32 x bfloat> [[ZN:%.*]], i64 0)
// CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 8 x bfloat> @llvm.vector.extract.nxv8bf16.nxv32bf16(<vscale x 32 x bfloat> [[ZN]], i64 8)
// CHECK-NEXT: [[TMP2:%.*]] = tail call <vscale x 8 x bfloat> @llvm.vector.extract.nxv8bf16.nxv32bf16(<vscale x 32 x bfloat> [[ZN]], i64 16)
// CHECK-NEXT: [[TMP3:%.*]] = tail call <vscale x 8 x bfloat> @llvm.vector.extract.nxv8bf16.nxv32bf16(<vscale x 32 x bfloat> [[ZN]], i64 24)
-// CHECK-NEXT: tail call void @llvm.aarch64.sme.fmlsl.lane.vg2x4.nxv8bf16(i32 [[ADD]], <vscale x 8 x bfloat> [[TMP0]], <vscale x 8 x bfloat> [[TMP1]], <vscale x 8 x bfloat> [[TMP2]], <vscale x 8 x bfloat> [[TMP3]], <vscale x 8 x bfloat> [[ZM:%.*]], i32 7)
+// CHECK-NEXT: tail call void @llvm.aarch64.sme.fmlsl.lane.vg2x4.nxv8bf16(i32 [[SLICE_BASE:%.*]], <vscale x 8 x bfloat> [[TMP0]], <vscale x 8 x bfloat> [[TMP1]], <vscale x 8 x bfloat> [[TMP2]], <vscale x 8 x bfloat> [[TMP3]], <vscale x 8 x bfloat> [[ZM:%.*]], i32 7)
// CHECK-NEXT: ret void
//
// CPP-CHECK-LABEL: @_Z21test_svmls_lane4_bf16j14svbfloat16x4_tu14__SVBfloat16_t(
// CPP-CHECK-NEXT: entry:
-// CPP-CHECK-NEXT: [[ADD:%.*]] = add i32 [[SLICE_BASE:%.*]], 6
// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 8 x bfloat> @llvm.vector.extract.nxv8bf16.nxv32bf16(<vscale x 32 x bfloat> [[ZN:%.*]], i64 0)
// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 8 x bfloat> @llvm.vector.extract.nxv8bf16.nxv32bf16(<vscale x 32 x bfloat> [[ZN]], i64 8)
// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call <vscale x 8 x bfloat> @llvm.vector.extract.nxv8bf16.nxv32bf16(<vscale x 32 x bfloat> [[ZN]], i64 16)
// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call <vscale x 8 x bfloat> @llvm.vector.extract.nxv8bf16.nxv32bf16(<vscale x 32 x bfloat> [[ZN]], i64 24)
-// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.fmlsl.lane.vg2x4.nxv8bf16(i32 [[ADD]], <vscale x 8 x bfloat> [[TMP0]], <vscale x 8 x bfloat> [[TMP1]], <vscale x 8 x bfloat> [[TMP2]], <vscale x 8 x bfloat> [[TMP3]], <vscale x 8 x bfloat> [[ZM:%.*]], i32 7)
+// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.fmlsl.lane.vg2x4.nxv8bf16(i32 [[SLICE_BASE:%.*]], <vscale x 8 x bfloat> [[TMP0]], <vscale x 8 x bfloat> [[TMP1]], <vscale x 8 x bfloat> [[TMP2]], <vscale x 8 x bfloat> [[TMP3]], <vscale x 8 x bfloat> [[ZM:%.*]], i32 7)
// CPP-CHECK-NEXT: ret void
//
void test_svmls_lane4_bf16(uint32_t slice_base, svbfloat16x4_t zn, svbfloat16_t zm) __arm_streaming __arm_shared_za
{
- SVE_ACLE_FUNC(svmls_lane,,_za32,_bf16,_vg2x4)(slice_base + 6, zn, zm, 7);
+ SVE_ACLE_FUNC(svmls_lane,,_za32,_bf16,_vg2x4)(slice_base, zn, zm, 7);
}
// CHECK-LABEL: @test_svmls_lane4_u16(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[ADD:%.*]] = add i32 [[SLICE_BASE:%.*]], 6
// CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv32i16(<vscale x 32 x i16> [[ZN:%.*]], i64 0)
// CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv32i16(<vscale x 32 x i16> [[ZN]], i64 8)
// CHECK-NEXT: [[TMP2:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv32i16(<vscale x 32 x i16> [[ZN]], i64 16)
// CHECK-NEXT: [[TMP3:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv32i16(<vscale x 32 x i16> [[ZN]], i64 24)
-// CHECK-NEXT: tail call void @llvm.aarch64.sme.umlsl.lane.vg2x4.nxv8i16(i32 [[ADD]], <vscale x 8 x i16> [[TMP0]], <vscale x 8 x i16> [[TMP1]], <vscale x 8 x i16> [[TMP2]], <vscale x 8 x i16> [[TMP3]], <vscale x 8 x i16> [[ZM:%.*]], i32 7)
+// CHECK-NEXT: tail call void @llvm.aarch64.sme.umlsl.lane.vg2x4.nxv8i16(i32 [[SLICE_BASE:%.*]], <vscale x 8 x i16> [[TMP0]], <vscale x 8 x i16> [[TMP1]], <vscale x 8 x i16> [[TMP2]], <vscale x 8 x i16> [[TMP3]], <vscale x 8 x i16> [[ZM:%.*]], i32 7)
// CHECK-NEXT: ret void
//
// CPP-CHECK-LABEL: @_Z20test_svmls_lane4_u16j12svuint16x4_tu12__SVUint16_t(
// CPP-CHECK-NEXT: entry:
-// CPP-CHECK-NEXT: [[ADD:%.*]] = add i32 [[SLICE_BASE:%.*]], 6
// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv32i16(<vscale x 32 x i16> [[ZN:%.*]], i64 0)
// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv32i16(<vscale x 32 x i16> [[ZN]], i64 8)
// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv32i16(<vscale x 32 x i16> [[ZN]], i64 16)
// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv32i16(<vscale x 32 x i16> [[ZN]], i64 24)
-// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.umlsl.lane.vg2x4.nxv8i16(i32 [[ADD]], <vscale x 8 x i16> [[TMP0]], <vscale x 8 x i16> [[TMP1]], <vscale x 8 x i16> [[TMP2]], <vscale x 8 x i16> [[TMP3]], <vscale x 8 x i16> [[ZM:%.*]], i32 7)
+// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.umlsl.lane.vg2x4.nxv8i16(i32 [[SLICE_BASE:%.*]], <vscale x 8 x i16> [[TMP0]], <vscale x 8 x i16> [[TMP1]], <vscale x 8 x i16> [[TMP2]], <vscale x 8 x i16> [[TMP3]], <vscale x 8 x i16> [[ZM:%.*]], i32 7)
// CPP-CHECK-NEXT: ret void
//
void test_svmls_lane4_u16(uint32_t slice_base, svuint16x4_t zn, svuint16_t zm) __arm_streaming __arm_shared_za
{
- SVE_ACLE_FUNC(svmls_lane,,_za32,_u16,_vg2x4)(slice_base + 6, zn, zm, 7);
+ SVE_ACLE_FUNC(svmls_lane,,_za32,_u16,_vg2x4)(slice_base, zn, zm, 7);
}
// CHECK-LABEL: @test_svmls_lane4_s16(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[ADD:%.*]] = add i32 [[SLICE_BASE:%.*]], 6
// CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv32i16(<vscale x 32 x i16> [[ZN:%.*]], i64 0)
// CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv32i16(<vscale x 32 x i16> [[ZN]], i64 8)
// CHECK-NEXT: [[TMP2:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv32i16(<vscale x 32 x i16> [[ZN]], i64 16)
// CHECK-NEXT: [[TMP3:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv32i16(<vscale x 32 x i16> [[ZN]], i64 24)
-// CHECK-NEXT: tail call void @llvm.aarch64.sme.smlsl.lane.vg2x4.nxv8i16(i32 [[ADD]], <vscale x 8 x i16> [[TMP0]], <vscale x 8 x i16> [[TMP1]], <vscale x 8 x i16> [[TMP2]], <vscale x 8 x i16> [[TMP3]], <vscale x 8 x i16> [[ZM:%.*]], i32 7)
+// CHECK-NEXT: tail call void @llvm.aarch64.sme.smlsl.lane.vg2x4.nxv8i16(i32 [[SLICE_BASE:%.*]], <vscale x 8 x i16> [[TMP0]], <vscale x 8 x i16> [[TMP1]], <vscale x 8 x i16> [[TMP2]], <vscale x 8 x i16> [[TMP3]], <vscale x 8 x i16> [[ZM:%.*]], i32 7)
// CHECK-NEXT: ret void
//
// CPP-CHECK-LABEL: @_Z20test_svmls_lane4_s16j11svint16x4_tu11__SVInt16_t(
// CPP-CHECK-NEXT: entry:
-// CPP-CHECK-NEXT: [[ADD:%.*]] = add i32 [[SLICE_BASE:%.*]], 6
// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv32i16(<vscale x 32 x i16> [[ZN:%.*]], i64 0)
// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv32i16(<vscale x 32 x i16> [[ZN]], i64 8)
// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv32i16(<vscale x 32 x i16> [[ZN]], i64 16)
// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv32i16(<vscale x 32 x i16> [[ZN]], i64 24)
-// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.smlsl.lane.vg2x4.nxv8i16(i32 [[ADD]], <vscale x 8 x i16> [[TMP0]], <vscale x 8 x i16> [[TMP1]], <vscale x 8 x i16> [[TMP2]], <vscale x 8 x i16> [[TMP3]], <vscale x 8 x i16> [[ZM:%.*]], i32 7)
+// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.smlsl.lane.vg2x4.nxv8i16(i32 [[SLICE_BASE:%.*]], <vscale x 8 x i16> [[TMP0]], <vscale x 8 x i16> [[TMP1]], <vscale x 8 x i16> [[TMP2]], <vscale x 8 x i16> [[TMP3]], <vscale x 8 x i16> [[ZM:%.*]], i32 7)
// CPP-CHECK-NEXT: ret void
//
void test_svmls_lane4_s16(uint32_t slice_base, svint16x4_t zn, svint16_t zm) __arm_streaming __arm_shared_za
{
- SVE_ACLE_FUNC(svmls_lane,,_za32,_s16,_vg2x4)(slice_base + 6, zn, zm, 7);
+ SVE_ACLE_FUNC(svmls_lane,,_za32,_s16,_vg2x4)(slice_base, zn, zm, 7);
}
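For context, every call in the tests above now passes the slice base to the builtin unmodified; the previous 'slice_base + N' adjustments are gone. Below is a minimal usage sketch (not part of the patch) of the three MLS call shapes exercised in this file, using the non-overloaded names from the tests. The function name 'sketch' is illustrative only, and the exact SME ACLE header name is an assumption (the in-tree tests of this era include a draft-named header):

#include <stdint.h>
#include <arm_sme.h> /* assumption: may be a draft-named SME header on some toolchains */

void sketch(uint32_t slice_base, svint16x4_t zn4, svint16x4_t zm4,
            svint16_t zm1) __arm_streaming __arm_shared_za {
  svmls_za32_s16_vg2x4(slice_base, zn4, zm4);         // multi-vector form
  svmls_single_za32_s16_vg2x4(slice_base, zn4, zm1);  // single-vector form
  svmls_lane_za32_s16_vg2x4(slice_base, zn4, zm1, 7); // indexed form; the lane
                                                      // index is a compile-time
                                                      // immediate (0..7 for
                                                      // 16-bit elements, per the
                                                      // ImmCheck0_7 constraints)
}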
>From e176df7b6a91d6f8168f892026c727b5892900b6 Mon Sep 17 00:00:00 2001
From: Dinar Temirbulatov <Dinar.Temirbulatov at arm.com>
Date: Fri, 15 Dec 2023 14:25:09 +0000
Subject: [PATCH 4/7] Corrected the TargetGuard strings, since TargetGuard blocks do not nest.
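The hunks below remove the 'let TargetGuard = "sme-i16i64"' blocks that were nested inside the "sme2" block. A minimal sketch of the assumed replacement pattern, with a single comma-separated guard string per block; only the first removed definition is shown, and the guard value "sme2,sme-i16i64" is an assumption about how these defs are re-added later in the patch:

// Assumed replacement: one flat, comma-separated guard instead of nesting.
let TargetGuard = "sme2,sme-i16i64" in {
  def SVMLAL_MULTI_VG4x2_S16 : Inst<"svmla_za64[_{d}]_vg4x2", "vm22", "s", MergeNone, "aarch64_sme_smla_za64_vg4x2", [IsStreaming, IsSharedZA], []>;
  // ... the remaining za64 MLA/MLS multi/single/indexed variants follow
  //     the same pattern.
}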
---
clang/include/clang/Basic/arm_sme.td | 96 +++++++++++++---------------
1 file changed, 46 insertions(+), 50 deletions(-)
diff --git a/clang/include/clang/Basic/arm_sme.td b/clang/include/clang/Basic/arm_sme.td
index bda493f58441a8..2a9bc6870bf713 100644
--- a/clang/include/clang/Basic/arm_sme.td
+++ b/clang/include/clang/Basic/arm_sme.td
@@ -404,13 +404,6 @@ let TargetGuard = "sme2" in {
def SVMLAL_MULTI_VG4x4_S8 : Inst<"svmla_za32[_{d}]_vg4x4", "vm44", "c", MergeNone, "aarch64_sme_smla_za32_vg4x4", [IsStreaming, IsSharedZA], []>;
def SVMLAL_MULTI_VG4x4_U8 : Inst<"svmla_za32[_{d}]_vg4x4", "vm44", "Uc", MergeNone, "aarch64_sme_umla_za32_vg4x4", [IsStreaming, IsSharedZA], []>;
- let TargetGuard = "sme-i16i64" in {
- def SVMLAL_MULTI_VG4x2_S16 : Inst<"svmla_za64[_{d}]_vg4x2", "vm22", "s", MergeNone, "aarch64_sme_smla_za64_vg4x2", [IsStreaming, IsSharedZA], []>;
- def SVMLAL_MULTI_VG4x2_U16 : Inst<"svmla_za64[_{d}]_vg4x2", "vm22", "Us", MergeNone, "aarch64_sme_umla_za64_vg4x2", [IsStreaming, IsSharedZA], []>;
- def SVMLAL_MULTI_VG4x4_S16 : Inst<"svmla_za64[_{d}]_vg4x4", "vm44", "s", MergeNone, "aarch64_sme_smla_za64_vg4x4", [IsStreaming, IsSharedZA], []>;
- def SVMLAL_MULTI_VG4x4_U16 : Inst<"svmla_za64[_{d}]_vg4x4", "vm44", "Us", MergeNone, "aarch64_sme_umla_za64_vg4x4", [IsStreaming, IsSharedZA], []>;
- }
-
// MULTI MLSL
def SVMLSL_MULTI_VG2x2_F16 : Inst<"svmls_za32[_{d}]_vg2x2", "vm22", "bh", MergeNone, "aarch64_sme_fmlsl_vg2x2", [IsStreaming, IsSharedZA], []>;
def SVMLSL_MULTI_VG2x4_F16 : Inst<"svmls_za32[_{d}]_vg2x4", "vm44", "bh", MergeNone, "aarch64_sme_fmlsl_vg2x4", [IsStreaming, IsSharedZA], []>;
@@ -424,13 +417,6 @@ let TargetGuard = "sme2" in {
def SVMLSL_MULTI_VG4x4_S8 : Inst<"svmls_za32[_{d}]_vg4x4", "vm44", "c", MergeNone, "aarch64_sme_smls_za32_vg4x4", [IsStreaming, IsSharedZA], []>;
def SVMLSL_MULTI_VG4x4_U8 : Inst<"svmls_za32[_{d}]_vg4x4", "vm44", "Uc", MergeNone, "aarch64_sme_umls_za32_vg4x4", [IsStreaming, IsSharedZA], []>;
- let TargetGuard = "sme-i16i64" in {
- def SVMLSL_MULTI_VG4x2_S16 : Inst<"svmls_za64[_{d}]_vg4x2", "vm22", "s", MergeNone, "aarch64_sme_smls_za64_vg4x2", [IsStreaming, IsSharedZA], []>;
- def SVMLSL_MULTI_VG4x2_U16 : Inst<"svmls_za64[_{d}]_vg4x2", "vm22", "Us", MergeNone, "aarch64_sme_umls_za64_vg4x2", [IsStreaming, IsSharedZA], []>;
- def SVMLSL_MULTI_VG4x4_S16 : Inst<"svmls_za64[_{d}]_vg4x4", "vm44", "s", MergeNone, "aarch64_sme_smls_za64_vg4x4", [IsStreaming, IsSharedZA], []>;
- def SVMLSL_MULTI_VG4x4_U16 : Inst<"svmls_za64[_{d}]_vg4x4", "vm44", "Us", MergeNone, "aarch64_sme_umls_za64_vg4x4", [IsStreaming, IsSharedZA], []>;
- }
-
// SINGLE MLAL
def SVMLAL_SINGLE_VG2x1_F16 : Inst<"svmla_za32[_{d}]_vg2x1", "vmdd", "bh", MergeNone, "aarch64_sme_fmlal_single_vg2x1", [IsStreaming, IsSharedZA], []>;
def SVMLAL_SINGLE_VG2x2_F16 : Inst<"svmla[_single]_za32[_{d}]_vg2x2", "vm2d", "bh", MergeNone, "aarch64_sme_fmlal_single_vg2x2", [IsStreaming, IsSharedZA], []>;
@@ -449,15 +435,6 @@ let TargetGuard = "sme2" in {
def SVMLAL_SINGLE_VG4x4_S8 : Inst<"svmla[_single]_za32[_{d}]_vg4x4", "vm4d", "c", MergeNone, "aarch64_sme_smla_za32_single_vg4x4", [IsStreaming, IsSharedZA], []>;
def SVMLAL_SINGLE_VG4x4_U8 : Inst<"svmla[_single]_za32[_{d}]_vg4x4", "vm4d", "Uc", MergeNone, "aarch64_sme_umla_za32_single_vg4x4", [IsStreaming, IsSharedZA], []>;
- let TargetGuard = "sme-i16i64" in {
- def SVMLAL_SINGLE_VG4x1_S16 : Inst<"svmla_za64[_{d}]_vg4x1", "vmdd", "s", MergeNone, "aarch64_sme_smla_za64_single_vg4x1", [IsStreaming, IsSharedZA], []>;
- def SVMLAL_SINGLE_VG4x1_U16 : Inst<"svmla_za64[_{d}]_vg4x1", "vmdd", "Us", MergeNone, "aarch64_sme_umla_za64_single_vg4x1", [IsStreaming, IsSharedZA], []>;
- def SVMLAL_SINGLE_VG4x2_S16 : Inst<"svmla[_single]_za64[_{d}]_vg4x2", "vm2d", "s", MergeNone, "aarch64_sme_smla_za64_single_vg4x2", [IsStreaming, IsSharedZA], []>;
- def SVMLAL_SINGLE_VG4x2_U16 : Inst<"svmla[_single]_za64[_{d}]_vg4x2", "vm2d", "Us", MergeNone, "aarch64_sme_umla_za64_single_vg4x2", [IsStreaming, IsSharedZA], []>;
- def SVMLAL_SINGLE_VG4x4_S16 : Inst<"svmla[_single]_za64[_{d}]_vg4x4", "vm4d", "s", MergeNone, "aarch64_sme_smla_za64_single_vg4x4", [IsStreaming, IsSharedZA], []>;
- def SVMLAL_SINGLE_VG4x4_U16 : Inst<"svmla[_single]_za64[_{d}]_vg4x4", "vm4d", "Us", MergeNone, "aarch64_sme_umla_za64_single_vg4x4", [IsStreaming, IsSharedZA], []>;
- }
-
// SINGLE MLSL
def SVMLSL_SINGLE_VG2x1_F16 : Inst<"svmls_za32[_{d}]_vg2x1", "vmdd", "bh", MergeNone, "aarch64_sme_fmlsl_single_vg2x1", [IsStreaming, IsSharedZA], []>;
def SVMLSL_SINGLE_VG2x2_F16 : Inst<"svmls[_single]_za32[_{d}]_vg2x2", "vm2d", "bh", MergeNone, "aarch64_sme_fmlsl_single_vg2x2", [IsStreaming, IsSharedZA], []>;
@@ -476,15 +453,6 @@ let TargetGuard = "sme2" in {
def SVMLSL_SINGLE_VG4x4_S8 : Inst<"svmls[_single]_za32[_{d}]_vg4x4", "vm4d", "c", MergeNone, "aarch64_sme_smls_za32_single_vg4x4", [IsStreaming, IsSharedZA], []>;
def SVMLSL_SINGLE_VG4x4_U8 : Inst<"svmls[_single]_za32[_{d}]_vg4x4", "vm4d", "Uc", MergeNone, "aarch64_sme_umls_za32_single_vg4x4", [IsStreaming, IsSharedZA], []>;
- let TargetGuard = "sme-i16i64" in {
- def SVMLSL_SINGLE_VG4x1_S16 : Inst<"svmls_za64[_{d}]_vg4x1", "vmdd", "s", MergeNone, "aarch64_sme_smls_za64_single_vg4x1", [IsStreaming, IsSharedZA], []>;
- def SVMLSL_SINGLE_VG4x1_U16 : Inst<"svmls_za64[_{d}]_vg4x1", "vmdd", "Us", MergeNone, "aarch64_sme_umls_za64_single_vg4x1", [IsStreaming, IsSharedZA], []>;
- def SVMLSL_SINGLE_VG4x2_S16 : Inst<"svmls[_single]_za64[_{d}]_vg4x2", "vm2d", "s", MergeNone, "aarch64_sme_smls_za64_single_vg4x2", [IsStreaming, IsSharedZA], []>;
- def SVMLSL_SINGLE_VG4x2_U16 : Inst<"svmls[_single]_za64[_{d}]_vg4x2", "vm2d", "Us", MergeNone, "aarch64_sme_umls_za64_single_vg4x2", [IsStreaming, IsSharedZA], []>;
- def SVMLSL_SINGLE_VG4x4_S16 : Inst<"svmls[_single]_za64[_{d}]_vg4x4", "vm4d", "s", MergeNone, "aarch64_sme_smls_za64_single_vg4x4", [IsStreaming, IsSharedZA], []>;
- def SVMLSL_SINGLE_VG4x4_U16 : Inst<"svmls[_single]_za64[_{d}]_vg4x4", "vm4d", "Us", MergeNone, "aarch64_sme_umls_za64_single_vg4x4", [IsStreaming, IsSharedZA], []>;
- }
-
// INDEXED MLAL
def SVMLAL_LANE_VG2x1_F16 : Inst<"svmla_lane_za32[_{d}]_vg2x1", "vmddi", "bh", MergeNone, "aarch64_sme_fmlal_lane_vg2x1", [IsStreaming, IsSharedZA], [ImmCheck<3, ImmCheck0_7>]>;
def SVMLAL_LANE_VG2x2_F16 : Inst<"svmla_lane_za32[_{d}]_vg2x2", "vm2di", "bh", MergeNone, "aarch64_sme_fmlal_lane_vg2x2", [IsStreaming, IsSharedZA], [ImmCheck<3, ImmCheck0_7>]>;
@@ -503,15 +471,6 @@ let TargetGuard = "sme2" in {
def SVMLAL_LANE_VG4x4_S8 : Inst<"svmla_lane_za32[_{d}]_vg4x4", "vm4di", "c", MergeNone, "aarch64_sme_smla_za32_lane_vg4x4", [IsStreaming, IsSharedZA], [ImmCheck<3, ImmCheck0_15>]>;
def SVMLAL_LANE_VG4x4_U8 : Inst<"svmla_lane_za32[_{d}]_vg4x4", "vm4di", "Uc", MergeNone, "aarch64_sme_umla_za32_lane_vg4x4", [IsStreaming, IsSharedZA], [ImmCheck<3, ImmCheck0_15>]>;
- let TargetGuard = "sme-i16i64" in {
- def SVMLAL_LANE_VG4x1_S16 : Inst<"svmla_lane_za64[_{d}]_vg4x1", "vmddi", "s", MergeNone, "aarch64_sme_smla_za64_lane_vg4x1", [IsStreaming, IsSharedZA], [ImmCheck<3, ImmCheck0_7>]>;
- def SVMLAL_LANE_VG4x1_U16 : Inst<"svmla_lane_za64[_{d}]_vg4x1", "vmddi", "Us", MergeNone, "aarch64_sme_umla_za64_lane_vg4x1", [IsStreaming, IsSharedZA], [ImmCheck<3, ImmCheck0_7>]>;
- def SVMLAL_LANE_VG4x2_S16 : Inst<"svmla_lane_za64[_{d}]_vg4x2", "vm2di", "s", MergeNone, "aarch64_sme_smla_za64_lane_vg4x2", [IsStreaming, IsSharedZA], [ImmCheck<3, ImmCheck0_7>]>;
- def SVMLAL_LANE_VG4x2_U16 : Inst<"svmla_lane_za64[_{d}]_vg4x2", "vm2di", "Us", MergeNone, "aarch64_sme_umla_za64_lane_vg4x2", [IsStreaming, IsSharedZA], [ImmCheck<3, ImmCheck0_7>]>;
- def SVMLAL_LANE_VG4x4_S16 : Inst<"svmla_lane_za64[_{d}]_vg4x4", "vm4di", "s", MergeNone, "aarch64_sme_smla_za64_lane_vg4x4", [IsStreaming, IsSharedZA], [ImmCheck<3, ImmCheck0_7>]>;
- def SVMLAL_LANE_VG4x4_U16 : Inst<"svmla_lane_za64[_{d}]_vg4x4", "vm4di", "Us", MergeNone, "aarch64_sme_umla_za64_lane_vg4x4", [IsStreaming, IsSharedZA], [ImmCheck<3, ImmCheck0_7>]>;
- }
-
// INDEXED MLSL
def SVMLSL_LANE_VG2x1_F16 : Inst<"svmls_lane_za32[_{d}]_vg2x1", "vmddi", "bh", MergeNone, "aarch64_sme_fmlsl_lane_vg2x1", [IsStreaming, IsSharedZA], [ImmCheck<3, ImmCheck0_7>]>;
def SVMLSL_LANE_VG2x2_F16 : Inst<"svmls_lane_za32[_{d}]_vg2x2", "vm2di", "bh", MergeNone, "aarch64_sme_fmlsl_lane_vg2x2", [IsStreaming, IsSharedZA], [ImmCheck<3, ImmCheck0_7>]>;
@@ -530,15 +489,6 @@ let TargetGuard = "sme2" in {
def SVMLSL_LANE_VG4x4_S8 : Inst<"svmls_lane_za32[_{d}]_vg4x4", "vm4di", "c", MergeNone, "aarch64_sme_smls_za32_lane_vg4x4", [IsStreaming, IsSharedZA], [ImmCheck<3, ImmCheck0_15>]>;
def SVMLSL_LANE_VG4x4_U8 : Inst<"svmls_lane_za32[_{d}]_vg4x4", "vm4di", "Uc", MergeNone, "aarch64_sme_umls_za32_lane_vg4x4", [IsStreaming, IsSharedZA], [ImmCheck<3, ImmCheck0_15>]>;
- let TargetGuard = "sme-i16i64" in {
- def SVMLSL_LANE_VG4x1_S16 : Inst<"svmls_lane_za64[_{d}]_vg4x1", "vmddi", "s", MergeNone, "aarch64_sme_smls_za64_lane_vg4x1", [IsStreaming, IsSharedZA], [ImmCheck<3, ImmCheck0_7>]>;
- def SVMLSL_LANE_VG4x1_U16 : Inst<"svmls_lane_za64[_{d}]_vg4x1", "vmddi", "Us", MergeNone, "aarch64_sme_umls_za64_lane_vg4x1", [IsStreaming, IsSharedZA], [ImmCheck<3, ImmCheck0_7>]>;
- def SVMLSL_LANE_VG4x2_S16 : Inst<"svmls_lane_za64[_{d}]_vg4x2", "vm2di", "s", MergeNone, "aarch64_sme_smls_za64_lane_vg4x2", [IsStreaming, IsSharedZA], [ImmCheck<3, ImmCheck0_7>]>;
- def SVMLSL_LANE_VG4x2_U16 : Inst<"svmls_lane_za64[_{d}]_vg4x2", "vm2di", "Us", MergeNone, "aarch64_sme_umls_za64_lane_vg4x2", [IsStreaming, IsSharedZA], [ImmCheck<3, ImmCheck0_7>]>;
- def SVMLSL_LANE_VG4x4_S16 : Inst<"svmls_lane_za64[_{d}]_vg4x4", "vm4di", "s", MergeNone, "aarch64_sme_smls_za64_lane_vg4x4", [IsStreaming, IsSharedZA], [ImmCheck<3, ImmCheck0_7>]>;
- def SVMLSL_LANE_VG4x4_U16 : Inst<"svmls_lane_za64[_{d}]_vg4x4", "vm4di", "Us", MergeNone, "aarch64_sme_umls_za64_lane_vg4x4", [IsStreaming, IsSharedZA], [ImmCheck<3, ImmCheck0_7>]>;
- }
-
// SINGLE SUMLALL
// Single sumla maps to usmla, with zn & zm operands swapped
def SVSUMLALL_SINGLE_VG4x1 : Inst<"svsumla_za32[_{d}]_vg4x1", "vmdu", "c", MergeNone, "aarch64_sme_usmla_za32_single_vg4x1", [IsStreaming, IsSharedZA], []>;
@@ -570,6 +520,52 @@ let TargetGuard = "sme2" in {
def SVUSMLALL_LANE_VG4x4 : Inst<"svusmla_lane_za32[_{d}]_vg4x4", "vm4xi", "Uc", MergeNone, "aarch64_sme_usmla_za32_lane_vg4x4", [IsStreaming, IsSharedZA], [ImmCheck<3, ImmCheck0_15>]>;
}
+let TargetGuard = "sme2,sme-i16i64" in {
+ // MULTI MLAL
+ def SVMLAL_MULTI_VG4x2_S16 : Inst<"svmla_za64[_{d}]_vg4x2", "vm22", "s", MergeNone, "aarch64_sme_smla_za64_vg4x2", [IsStreaming, IsSharedZA], []>;
+ def SVMLAL_MULTI_VG4x2_U16 : Inst<"svmla_za64[_{d}]_vg4x2", "vm22", "Us", MergeNone, "aarch64_sme_umla_za64_vg4x2", [IsStreaming, IsSharedZA], []>;
+ def SVMLAL_MULTI_VG4x4_S16 : Inst<"svmla_za64[_{d}]_vg4x4", "vm44", "s", MergeNone, "aarch64_sme_smla_za64_vg4x4", [IsStreaming, IsSharedZA], []>;
+ def SVMLAL_MULTI_VG4x4_U16 : Inst<"svmla_za64[_{d}]_vg4x4", "vm44", "Us", MergeNone, "aarch64_sme_umla_za64_vg4x4", [IsStreaming, IsSharedZA], []>;
+
+ // MULTI MLSL
+ def SVMLSL_MULTI_VG4x2_S16 : Inst<"svmls_za64[_{d}]_vg4x2", "vm22", "s", MergeNone, "aarch64_sme_smls_za64_vg4x2", [IsStreaming, IsSharedZA], []>;
+ def SVMLSL_MULTI_VG4x2_U16 : Inst<"svmls_za64[_{d}]_vg4x2", "vm22", "Us", MergeNone, "aarch64_sme_umls_za64_vg4x2", [IsStreaming, IsSharedZA], []>;
+ def SVMLSL_MULTI_VG4x4_S16 : Inst<"svmls_za64[_{d}]_vg4x4", "vm44", "s", MergeNone, "aarch64_sme_smls_za64_vg4x4", [IsStreaming, IsSharedZA], []>;
+ def SVMLSL_MULTI_VG4x4_U16 : Inst<"svmls_za64[_{d}]_vg4x4", "vm44", "Us", MergeNone, "aarch64_sme_umls_za64_vg4x4", [IsStreaming, IsSharedZA], []>;
+
+ // SINGLE MLAL
+ def SVMLAL_SINGLE_VG4x1_S16 : Inst<"svmla_za64[_{d}]_vg4x1", "vmdd", "s", MergeNone, "aarch64_sme_smla_za64_single_vg4x1", [IsStreaming, IsSharedZA], []>;
+ def SVMLAL_SINGLE_VG4x1_U16 : Inst<"svmla_za64[_{d}]_vg4x1", "vmdd", "Us", MergeNone, "aarch64_sme_umla_za64_single_vg4x1", [IsStreaming, IsSharedZA], []>;
+ def SVMLAL_SINGLE_VG4x2_S16 : Inst<"svmla[_single]_za64[_{d}]_vg4x2", "vm2d", "s", MergeNone, "aarch64_sme_smla_za64_single_vg4x2", [IsStreaming, IsSharedZA], []>;
+ def SVMLAL_SINGLE_VG4x2_U16 : Inst<"svmla[_single]_za64[_{d}]_vg4x2", "vm2d", "Us", MergeNone, "aarch64_sme_umla_za64_single_vg4x2", [IsStreaming, IsSharedZA], []>;
+ def SVMLAL_SINGLE_VG4x4_S16 : Inst<"svmla[_single]_za64[_{d}]_vg4x4", "vm4d", "s", MergeNone, "aarch64_sme_smla_za64_single_vg4x4", [IsStreaming, IsSharedZA], []>;
+ def SVMLAL_SINGLE_VG4x4_U16 : Inst<"svmla[_single]_za64[_{d}]_vg4x4", "vm4d", "Us", MergeNone, "aarch64_sme_umla_za64_single_vg4x4", [IsStreaming, IsSharedZA], []>;
+
+ // SINGLE MLSL
+ def SVMLSL_SINGLE_VG4x1_S16 : Inst<"svmls_za64[_{d}]_vg4x1", "vmdd", "s", MergeNone, "aarch64_sme_smls_za64_single_vg4x1", [IsStreaming, IsSharedZA], []>;
+ def SVMLSL_SINGLE_VG4x1_U16 : Inst<"svmls_za64[_{d}]_vg4x1", "vmdd", "Us", MergeNone, "aarch64_sme_umls_za64_single_vg4x1", [IsStreaming, IsSharedZA], []>;
+ def SVMLSL_SINGLE_VG4x2_S16 : Inst<"svmls[_single]_za64[_{d}]_vg4x2", "vm2d", "s", MergeNone, "aarch64_sme_smls_za64_single_vg4x2", [IsStreaming, IsSharedZA], []>;
+ def SVMLSL_SINGLE_VG4x2_U16 : Inst<"svmls[_single]_za64[_{d}]_vg4x2", "vm2d", "Us", MergeNone, "aarch64_sme_umls_za64_single_vg4x2", [IsStreaming, IsSharedZA], []>;
+ def SVMLSL_SINGLE_VG4x4_S16 : Inst<"svmls[_single]_za64[_{d}]_vg4x4", "vm4d", "s", MergeNone, "aarch64_sme_smls_za64_single_vg4x4", [IsStreaming, IsSharedZA], []>;
+ def SVMLSL_SINGLE_VG4x4_U16 : Inst<"svmls[_single]_za64[_{d}]_vg4x4", "vm4d", "Us", MergeNone, "aarch64_sme_umls_za64_single_vg4x4", [IsStreaming, IsSharedZA], []>;
+
+ // INDEXED MLAL
+ def SVMLAL_LANE_VG4x1_S16 : Inst<"svmla_lane_za64[_{d}]_vg4x1", "vmddi", "s", MergeNone, "aarch64_sme_smla_za64_lane_vg4x1", [IsStreaming, IsSharedZA], [ImmCheck<3, ImmCheck0_7>]>;
+ def SVMLAL_LANE_VG4x1_U16 : Inst<"svmla_lane_za64[_{d}]_vg4x1", "vmddi", "Us", MergeNone, "aarch64_sme_umla_za64_lane_vg4x1", [IsStreaming, IsSharedZA], [ImmCheck<3, ImmCheck0_7>]>;
+ def SVMLAL_LANE_VG4x2_S16 : Inst<"svmla_lane_za64[_{d}]_vg4x2", "vm2di", "s", MergeNone, "aarch64_sme_smla_za64_lane_vg4x2", [IsStreaming, IsSharedZA], [ImmCheck<3, ImmCheck0_7>]>;
+ def SVMLAL_LANE_VG4x2_U16 : Inst<"svmla_lane_za64[_{d}]_vg4x2", "vm2di", "Us", MergeNone, "aarch64_sme_umla_za64_lane_vg4x2", [IsStreaming, IsSharedZA], [ImmCheck<3, ImmCheck0_7>]>;
+ def SVMLAL_LANE_VG4x4_S16 : Inst<"svmla_lane_za64[_{d}]_vg4x4", "vm4di", "s", MergeNone, "aarch64_sme_smla_za64_lane_vg4x4", [IsStreaming, IsSharedZA], [ImmCheck<3, ImmCheck0_7>]>;
+ def SVMLAL_LANE_VG4x4_U16 : Inst<"svmla_lane_za64[_{d}]_vg4x4", "vm4di", "Us", MergeNone, "aarch64_sme_umla_za64_lane_vg4x4", [IsStreaming, IsSharedZA], [ImmCheck<3, ImmCheck0_7>]>;
+
+ // INDEXED MLSL
+ def SVMLSL_LANE_VG4x1_S16 : Inst<"svmls_lane_za64[_{d}]_vg4x1", "vmddi", "s", MergeNone, "aarch64_sme_smls_za64_lane_vg4x1", [IsStreaming, IsSharedZA], [ImmCheck<3, ImmCheck0_7>]>;
+ def SVMLSL_LANE_VG4x1_U16 : Inst<"svmls_lane_za64[_{d}]_vg4x1", "vmddi", "Us", MergeNone, "aarch64_sme_umls_za64_lane_vg4x1", [IsStreaming, IsSharedZA], [ImmCheck<3, ImmCheck0_7>]>;
+ def SVMLSL_LANE_VG4x2_S16 : Inst<"svmls_lane_za64[_{d}]_vg4x2", "vm2di", "s", MergeNone, "aarch64_sme_smls_za64_lane_vg4x2", [IsStreaming, IsSharedZA], [ImmCheck<3, ImmCheck0_7>]>;
+ def SVMLSL_LANE_VG4x2_U16 : Inst<"svmls_lane_za64[_{d}]_vg4x2", "vm2di", "Us", MergeNone, "aarch64_sme_umls_za64_lane_vg4x2", [IsStreaming, IsSharedZA], [ImmCheck<3, ImmCheck0_7>]>;
+ def SVMLSL_LANE_VG4x4_S16 : Inst<"svmls_lane_za64[_{d}]_vg4x4", "vm4di", "s", MergeNone, "aarch64_sme_smls_za64_lane_vg4x4", [IsStreaming, IsSharedZA], [ImmCheck<3, ImmCheck0_7>]>;
+ def SVMLSL_LANE_VG4x4_U16 : Inst<"svmls_lane_za64[_{d}]_vg4x4", "vm4di", "Us", MergeNone, "aarch64_sme_umls_za64_lane_vg4x4", [IsStreaming, IsSharedZA], [ImmCheck<3, ImmCheck0_7>]>;
+}
+
//
// Spill and fill of ZT0
//
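(For reviewers skimming the TableGen above: this patch folds the sme-i16i64
definitions into one combined "sme2,sme-i16i64" guard instead of a nested
guard inside each MLAL/MLSL group. A minimal caller sketch follows -- it is
illustrative only and not part of the patch, and it assumes the [_{d}]
placeholder expands to _s16 for the "s" type and that the _single/_lane
spellings follow the same convention as the autogenerated tests below. It
would need both +sme2 and +sme-i16i64:

#include <arm_sme_draft_spec_subject_to_change.h>

// Accumulate widened 16-bit products into 64-bit ZA slices,
// quad-vector-group forms.
void za64_mla_sketch(uint32_t slice_base, svint16x2_t zn2, svint16x2_t zm2,
                     svint16_t zn, svint16_t zm) __arm_streaming __arm_shared_za {
  svmla_za64_s16_vg4x2(slice_base, zn2, zm2);        // multi:  SVMLAL_MULTI_VG4x2_S16
  svmla_single_za64_s16_vg4x2(slice_base, zn2, zm);  // single: SVMLAL_SINGLE_VG4x2_S16
  svmla_lane_za64_s16_vg4x1(slice_base, zn, zm, 7);  // lane must be in [0,7], per ImmCheck0_7
}
)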
From 854756a1f18947b3894dc1d66341d180703b7056 Mon Sep 17 00:00:00 2001
From: Dinar Temirbulatov <Dinar.Temirbulatov at arm.com>
Date: Sun, 17 Dec 2023 00:34:50 +0000
Subject: [PATCH 5/7] Add extra tests.
---
.../aarch64-sme2-intrinsics/acle_sme2_mla.c | 292 ++++++++++++++++++
.../aarch64-sme2-intrinsics/acle_sme2_mls.c | 292 ++++++++++++++++++
2 files changed, 584 insertions(+)
create mode 100644 clang/test/CodeGen/aarch64-sme2-intrinsics/acle_sme2_mla.c
create mode 100644 clang/test/CodeGen/aarch64-sme2-intrinsics/acle_sme2_mls.c
diff --git a/clang/test/CodeGen/aarch64-sme2-intrinsics/acle_sme2_mla.c b/clang/test/CodeGen/aarch64-sme2-intrinsics/acle_sme2_mla.c
new file mode 100644
index 00000000000000..b1992ae3ee9159
--- /dev/null
+++ b/clang/test/CodeGen/aarch64-sme2-intrinsics/acle_sme2_mla.c
@@ -0,0 +1,292 @@
+// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py
+// REQUIRES: aarch64-registered-target
+
+// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme2 -target-feature +sme-f64f64 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s
+// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme2 -target-feature +sme-f64f64 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK
+// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sme2 -target-feature +sme-f64f64 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s
+// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sme2 -target-feature +sme-f64f64 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK
+// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme2 -target-feature +sme-f64f64 -S -disable-O0-optnone -Werror -Wall -o /dev/null %s
+#include <arm_sme_draft_spec_subject_to_change.h>
+
+#ifdef SVE_OVERLOADED_FORMS
+// A simple used,unused... macro, long enough to represent any SVE builtin.
+#define SVE_ACLE_FUNC(A1,A2_UNUSED,A3,A4_UNUSED,A5) A1##A3##A5
+#else
+#define SVE_ACLE_FUNC(A1,A2,A3,A4,A5) A1##A2##A3##A4##A5
+#endif
+
+//
+// Multi, multi
+// CHECK-LABEL: @test_svmla2_f32(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 4 x float> @llvm.vector.extract.nxv4f32.nxv8f32(<vscale x 8 x float> [[ZN:%.*]], i64 0)
+// CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 4 x float> @llvm.vector.extract.nxv4f32.nxv8f32(<vscale x 8 x float> [[ZN]], i64 4)
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <vscale x 4 x float> @llvm.vector.extract.nxv4f32.nxv8f32(<vscale x 8 x float> [[ZM:%.*]], i64 0)
+// CHECK-NEXT: [[TMP3:%.*]] = tail call <vscale x 4 x float> @llvm.vector.extract.nxv4f32.nxv8f32(<vscale x 8 x float> [[ZM]], i64 4)
+// CHECK-NEXT: tail call void @llvm.aarch64.sme.fmla.vg1x2.nxv4f32(i32 [[SLICE_BASE:%.*]], <vscale x 4 x float> [[TMP0]], <vscale x 4 x float> [[TMP1]], <vscale x 4 x float> [[TMP2]], <vscale x 4 x float> [[TMP3]])
+// CHECK-NEXT: ret void
+//
+// CPP-CHECK-LABEL: @_Z15test_svmla2_f32j13svfloat32x2_tS_(
+// CPP-CHECK-NEXT: entry:
+// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 4 x float> @llvm.vector.extract.nxv4f32.nxv8f32(<vscale x 8 x float> [[ZN:%.*]], i64 0)
+// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 4 x float> @llvm.vector.extract.nxv4f32.nxv8f32(<vscale x 8 x float> [[ZN]], i64 4)
+// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call <vscale x 4 x float> @llvm.vector.extract.nxv4f32.nxv8f32(<vscale x 8 x float> [[ZM:%.*]], i64 0)
+// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call <vscale x 4 x float> @llvm.vector.extract.nxv4f32.nxv8f32(<vscale x 8 x float> [[ZM]], i64 4)
+// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.fmla.vg1x2.nxv4f32(i32 [[SLICE_BASE:%.*]], <vscale x 4 x float> [[TMP0]], <vscale x 4 x float> [[TMP1]], <vscale x 4 x float> [[TMP2]], <vscale x 4 x float> [[TMP3]])
+// CPP-CHECK-NEXT: ret void
+//
+void test_svmla2_f32(uint32_t slice_base, svfloat32x2_t zn, svfloat32x2_t zm) __arm_streaming __arm_shared_za {
+ SVE_ACLE_FUNC(svmla,,_za32,_f32,_vg1x2)(slice_base, zn, zm);
+}
+
+// CHECK-LABEL: @test_svmla4_f32(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 4 x float> @llvm.vector.extract.nxv4f32.nxv16f32(<vscale x 16 x float> [[ZN:%.*]], i64 0)
+// CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 4 x float> @llvm.vector.extract.nxv4f32.nxv16f32(<vscale x 16 x float> [[ZN]], i64 4)
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <vscale x 4 x float> @llvm.vector.extract.nxv4f32.nxv16f32(<vscale x 16 x float> [[ZN]], i64 8)
+// CHECK-NEXT: [[TMP3:%.*]] = tail call <vscale x 4 x float> @llvm.vector.extract.nxv4f32.nxv16f32(<vscale x 16 x float> [[ZN]], i64 12)
+// CHECK-NEXT: [[TMP4:%.*]] = tail call <vscale x 4 x float> @llvm.vector.extract.nxv4f32.nxv16f32(<vscale x 16 x float> [[ZM:%.*]], i64 0)
+// CHECK-NEXT: [[TMP5:%.*]] = tail call <vscale x 4 x float> @llvm.vector.extract.nxv4f32.nxv16f32(<vscale x 16 x float> [[ZM]], i64 4)
+// CHECK-NEXT: [[TMP6:%.*]] = tail call <vscale x 4 x float> @llvm.vector.extract.nxv4f32.nxv16f32(<vscale x 16 x float> [[ZM]], i64 8)
+// CHECK-NEXT: [[TMP7:%.*]] = tail call <vscale x 4 x float> @llvm.vector.extract.nxv4f32.nxv16f32(<vscale x 16 x float> [[ZM]], i64 12)
+// CHECK-NEXT: tail call void @llvm.aarch64.sme.fmla.vg1x4.nxv4f32(i32 [[SLICE_BASE:%.*]], <vscale x 4 x float> [[TMP0]], <vscale x 4 x float> [[TMP1]], <vscale x 4 x float> [[TMP2]], <vscale x 4 x float> [[TMP3]], <vscale x 4 x float> [[TMP4]], <vscale x 4 x float> [[TMP5]], <vscale x 4 x float> [[TMP6]], <vscale x 4 x float> [[TMP7]])
+// CHECK-NEXT: ret void
+//
+// CPP-CHECK-LABEL: @_Z15test_svmla4_f32j13svfloat32x4_tS_(
+// CPP-CHECK-NEXT: entry:
+// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 4 x float> @llvm.vector.extract.nxv4f32.nxv16f32(<vscale x 16 x float> [[ZN:%.*]], i64 0)
+// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 4 x float> @llvm.vector.extract.nxv4f32.nxv16f32(<vscale x 16 x float> [[ZN]], i64 4)
+// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call <vscale x 4 x float> @llvm.vector.extract.nxv4f32.nxv16f32(<vscale x 16 x float> [[ZN]], i64 8)
+// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call <vscale x 4 x float> @llvm.vector.extract.nxv4f32.nxv16f32(<vscale x 16 x float> [[ZN]], i64 12)
+// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call <vscale x 4 x float> @llvm.vector.extract.nxv4f32.nxv16f32(<vscale x 16 x float> [[ZM:%.*]], i64 0)
+// CPP-CHECK-NEXT: [[TMP5:%.*]] = tail call <vscale x 4 x float> @llvm.vector.extract.nxv4f32.nxv16f32(<vscale x 16 x float> [[ZM]], i64 4)
+// CPP-CHECK-NEXT: [[TMP6:%.*]] = tail call <vscale x 4 x float> @llvm.vector.extract.nxv4f32.nxv16f32(<vscale x 16 x float> [[ZM]], i64 8)
+// CPP-CHECK-NEXT: [[TMP7:%.*]] = tail call <vscale x 4 x float> @llvm.vector.extract.nxv4f32.nxv16f32(<vscale x 16 x float> [[ZM]], i64 12)
+// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.fmla.vg1x4.nxv4f32(i32 [[SLICE_BASE:%.*]], <vscale x 4 x float> [[TMP0]], <vscale x 4 x float> [[TMP1]], <vscale x 4 x float> [[TMP2]], <vscale x 4 x float> [[TMP3]], <vscale x 4 x float> [[TMP4]], <vscale x 4 x float> [[TMP5]], <vscale x 4 x float> [[TMP6]], <vscale x 4 x float> [[TMP7]])
+// CPP-CHECK-NEXT: ret void
+//
+void test_svmla4_f32(uint32_t slice_base, svfloat32x4_t zn, svfloat32x4_t zm) __arm_streaming __arm_shared_za {
+ SVE_ACLE_FUNC(svmla,,_za32,_f32,_vg1x4)(slice_base, zn, zm);
+}
+
+//
+// Multi, single
+// CHECK-LABEL: @test_svmla_single2_f32(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 4 x float> @llvm.vector.extract.nxv4f32.nxv8f32(<vscale x 8 x float> [[ZN:%.*]], i64 0)
+// CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 4 x float> @llvm.vector.extract.nxv4f32.nxv8f32(<vscale x 8 x float> [[ZN]], i64 4)
+// CHECK-NEXT: tail call void @llvm.aarch64.sme.fmla.single.vg1x2.nxv4f32(i32 [[SLICE_BASE:%.*]], <vscale x 4 x float> [[TMP0]], <vscale x 4 x float> [[TMP1]], <vscale x 4 x float> [[ZM:%.*]])
+// CHECK-NEXT: ret void
+//
+// CPP-CHECK-LABEL: @_Z22test_svmla_single2_f32j13svfloat32x2_tu13__SVFloat32_t(
+// CPP-CHECK-NEXT: entry:
+// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 4 x float> @llvm.vector.extract.nxv4f32.nxv8f32(<vscale x 8 x float> [[ZN:%.*]], i64 0)
+// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 4 x float> @llvm.vector.extract.nxv4f32.nxv8f32(<vscale x 8 x float> [[ZN]], i64 4)
+// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.fmla.single.vg1x2.nxv4f32(i32 [[SLICE_BASE:%.*]], <vscale x 4 x float> [[TMP0]], <vscale x 4 x float> [[TMP1]], <vscale x 4 x float> [[ZM:%.*]])
+// CPP-CHECK-NEXT: ret void
+//
+void test_svmla_single2_f32(uint32_t slice_base, svfloat32x2_t zn, svfloat32_t zm) __arm_streaming __arm_shared_za {
+ SVE_ACLE_FUNC(svmla,_single,_za32,_f32,_vg1x2)(slice_base, zn, zm);
+}
+
+// CHECK-LABEL: @test_svmla_single4_f32(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 4 x float> @llvm.vector.extract.nxv4f32.nxv16f32(<vscale x 16 x float> [[ZN:%.*]], i64 0)
+// CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 4 x float> @llvm.vector.extract.nxv4f32.nxv16f32(<vscale x 16 x float> [[ZN]], i64 4)
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <vscale x 4 x float> @llvm.vector.extract.nxv4f32.nxv16f32(<vscale x 16 x float> [[ZN]], i64 8)
+// CHECK-NEXT: [[TMP3:%.*]] = tail call <vscale x 4 x float> @llvm.vector.extract.nxv4f32.nxv16f32(<vscale x 16 x float> [[ZN]], i64 12)
+// CHECK-NEXT: tail call void @llvm.aarch64.sme.fmla.single.vg1x4.nxv4f32(i32 [[SLICE_BASE:%.*]], <vscale x 4 x float> [[TMP0]], <vscale x 4 x float> [[TMP1]], <vscale x 4 x float> [[TMP2]], <vscale x 4 x float> [[TMP3]], <vscale x 4 x float> [[ZM:%.*]])
+// CHECK-NEXT: ret void
+//
+// CPP-CHECK-LABEL: @_Z22test_svmla_single4_f32j13svfloat32x4_tu13__SVFloat32_t(
+// CPP-CHECK-NEXT: entry:
+// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 4 x float> @llvm.vector.extract.nxv4f32.nxv16f32(<vscale x 16 x float> [[ZN:%.*]], i64 0)
+// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 4 x float> @llvm.vector.extract.nxv4f32.nxv16f32(<vscale x 16 x float> [[ZN]], i64 4)
+// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call <vscale x 4 x float> @llvm.vector.extract.nxv4f32.nxv16f32(<vscale x 16 x float> [[ZN]], i64 8)
+// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call <vscale x 4 x float> @llvm.vector.extract.nxv4f32.nxv16f32(<vscale x 16 x float> [[ZN]], i64 12)
+// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.fmla.single.vg1x4.nxv4f32(i32 [[SLICE_BASE:%.*]], <vscale x 4 x float> [[TMP0]], <vscale x 4 x float> [[TMP1]], <vscale x 4 x float> [[TMP2]], <vscale x 4 x float> [[TMP3]], <vscale x 4 x float> [[ZM:%.*]])
+// CPP-CHECK-NEXT: ret void
+//
+void test_svmla_single4_f32(uint32_t slice_base, svfloat32x4_t zn, svfloat32_t zm) __arm_streaming __arm_shared_za {
+ SVE_ACLE_FUNC(svmla,_single,_za32,_f32,_vg1x4)(slice_base, zn, zm);
+}
+
+//
+// Multi, indexed
+// CHECK-LABEL: @test_svmla_lane2_f32(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 4 x float> @llvm.vector.extract.nxv4f32.nxv8f32(<vscale x 8 x float> [[ZN:%.*]], i64 0)
+// CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 4 x float> @llvm.vector.extract.nxv4f32.nxv8f32(<vscale x 8 x float> [[ZN]], i64 4)
+// CHECK-NEXT: tail call void @llvm.aarch64.sme.fmla.lane.vg1x2.nxv4f32(i32 [[SLICE_BASE:%.*]], <vscale x 4 x float> [[TMP0]], <vscale x 4 x float> [[TMP1]], <vscale x 4 x float> [[ZM:%.*]], i32 3)
+// CHECK-NEXT: ret void
+//
+// CPP-CHECK-LABEL: @_Z20test_svmla_lane2_f32j13svfloat32x2_tu13__SVFloat32_t(
+// CPP-CHECK-NEXT: entry:
+// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 4 x float> @llvm.vector.extract.nxv4f32.nxv8f32(<vscale x 8 x float> [[ZN:%.*]], i64 0)
+// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 4 x float> @llvm.vector.extract.nxv4f32.nxv8f32(<vscale x 8 x float> [[ZN]], i64 4)
+// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.fmla.lane.vg1x2.nxv4f32(i32 [[SLICE_BASE:%.*]], <vscale x 4 x float> [[TMP0]], <vscale x 4 x float> [[TMP1]], <vscale x 4 x float> [[ZM:%.*]], i32 3)
+// CPP-CHECK-NEXT: ret void
+//
+void test_svmla_lane2_f32(uint32_t slice_base, svfloat32x2_t zn, svfloat32_t zm) __arm_streaming __arm_shared_za {
+ SVE_ACLE_FUNC(svmla_lane,,_za32,_f32,_vg1x2)(slice_base, zn, zm, 3);
+}
+
+// CHECK-LABEL: @test_svmla_lane4_f32(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 4 x float> @llvm.vector.extract.nxv4f32.nxv16f32(<vscale x 16 x float> [[ZN:%.*]], i64 0)
+// CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 4 x float> @llvm.vector.extract.nxv4f32.nxv16f32(<vscale x 16 x float> [[ZN]], i64 4)
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <vscale x 4 x float> @llvm.vector.extract.nxv4f32.nxv16f32(<vscale x 16 x float> [[ZN]], i64 8)
+// CHECK-NEXT: [[TMP3:%.*]] = tail call <vscale x 4 x float> @llvm.vector.extract.nxv4f32.nxv16f32(<vscale x 16 x float> [[ZN]], i64 12)
+// CHECK-NEXT: tail call void @llvm.aarch64.sme.fmla.lane.vg1x4.nxv4f32(i32 [[SLICE_BASE:%.*]], <vscale x 4 x float> [[TMP0]], <vscale x 4 x float> [[TMP1]], <vscale x 4 x float> [[TMP2]], <vscale x 4 x float> [[TMP3]], <vscale x 4 x float> [[ZM:%.*]], i32 3)
+// CHECK-NEXT: ret void
+//
+// CPP-CHECK-LABEL: @_Z20test_svmla_lane4_f32j13svfloat32x4_tu13__SVFloat32_t(
+// CPP-CHECK-NEXT: entry:
+// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 4 x float> @llvm.vector.extract.nxv4f32.nxv16f32(<vscale x 16 x float> [[ZN:%.*]], i64 0)
+// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 4 x float> @llvm.vector.extract.nxv4f32.nxv16f32(<vscale x 16 x float> [[ZN]], i64 4)
+// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call <vscale x 4 x float> @llvm.vector.extract.nxv4f32.nxv16f32(<vscale x 16 x float> [[ZN]], i64 8)
+// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call <vscale x 4 x float> @llvm.vector.extract.nxv4f32.nxv16f32(<vscale x 16 x float> [[ZN]], i64 12)
+// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.fmla.lane.vg1x4.nxv4f32(i32 [[SLICE_BASE:%.*]], <vscale x 4 x float> [[TMP0]], <vscale x 4 x float> [[TMP1]], <vscale x 4 x float> [[TMP2]], <vscale x 4 x float> [[TMP3]], <vscale x 4 x float> [[ZM:%.*]], i32 3)
+// CPP-CHECK-NEXT: ret void
+//
+void test_svmla_lane4_f32(uint32_t slice_base, svfloat32x4_t zn, svfloat32_t zm) __arm_streaming __arm_shared_za {
+ SVE_ACLE_FUNC(svmla_lane,,_za32,_f32,_vg1x4)(slice_base, zn, zm, 3);
+}
+
+//
+// Multi, multi
+// CHECK-LABEL: @test_svmla2_f64(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 2 x double> @llvm.vector.extract.nxv2f64.nxv4f64(<vscale x 4 x double> [[ZN:%.*]], i64 0)
+// CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 2 x double> @llvm.vector.extract.nxv2f64.nxv4f64(<vscale x 4 x double> [[ZN]], i64 2)
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <vscale x 2 x double> @llvm.vector.extract.nxv2f64.nxv4f64(<vscale x 4 x double> [[ZM:%.*]], i64 0)
+// CHECK-NEXT: [[TMP3:%.*]] = tail call <vscale x 2 x double> @llvm.vector.extract.nxv2f64.nxv4f64(<vscale x 4 x double> [[ZM]], i64 2)
+// CHECK-NEXT: tail call void @llvm.aarch64.sme.fmla.vg1x2.nxv2f64(i32 [[SLICE_BASE:%.*]], <vscale x 2 x double> [[TMP0]], <vscale x 2 x double> [[TMP1]], <vscale x 2 x double> [[TMP2]], <vscale x 2 x double> [[TMP3]])
+// CHECK-NEXT: ret void
+//
+// CPP-CHECK-LABEL: @_Z15test_svmla2_f64j13svfloat64x2_tS_(
+// CPP-CHECK-NEXT: entry:
+// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 2 x double> @llvm.vector.extract.nxv2f64.nxv4f64(<vscale x 4 x double> [[ZN:%.*]], i64 0)
+// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 2 x double> @llvm.vector.extract.nxv2f64.nxv4f64(<vscale x 4 x double> [[ZN]], i64 2)
+// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call <vscale x 2 x double> @llvm.vector.extract.nxv2f64.nxv4f64(<vscale x 4 x double> [[ZM:%.*]], i64 0)
+// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call <vscale x 2 x double> @llvm.vector.extract.nxv2f64.nxv4f64(<vscale x 4 x double> [[ZM]], i64 2)
+// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.fmla.vg1x2.nxv2f64(i32 [[SLICE_BASE:%.*]], <vscale x 2 x double> [[TMP0]], <vscale x 2 x double> [[TMP1]], <vscale x 2 x double> [[TMP2]], <vscale x 2 x double> [[TMP3]])
+// CPP-CHECK-NEXT: ret void
+//
+void test_svmla2_f64(uint32_t slice_base, svfloat64x2_t zn, svfloat64x2_t zm) __arm_streaming __arm_shared_za {
+ SVE_ACLE_FUNC(svmla,,_za64,_f64,_vg1x2)(slice_base, zn, zm);
+}
+
+// CHECK-LABEL: @test_svmla4_f64(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 2 x double> @llvm.vector.extract.nxv2f64.nxv8f64(<vscale x 8 x double> [[ZN:%.*]], i64 0)
+// CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 2 x double> @llvm.vector.extract.nxv2f64.nxv8f64(<vscale x 8 x double> [[ZN]], i64 2)
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <vscale x 2 x double> @llvm.vector.extract.nxv2f64.nxv8f64(<vscale x 8 x double> [[ZN]], i64 4)
+// CHECK-NEXT: [[TMP3:%.*]] = tail call <vscale x 2 x double> @llvm.vector.extract.nxv2f64.nxv8f64(<vscale x 8 x double> [[ZN]], i64 6)
+// CHECK-NEXT: [[TMP4:%.*]] = tail call <vscale x 2 x double> @llvm.vector.extract.nxv2f64.nxv8f64(<vscale x 8 x double> [[ZM:%.*]], i64 0)
+// CHECK-NEXT: [[TMP5:%.*]] = tail call <vscale x 2 x double> @llvm.vector.extract.nxv2f64.nxv8f64(<vscale x 8 x double> [[ZM]], i64 2)
+// CHECK-NEXT: [[TMP6:%.*]] = tail call <vscale x 2 x double> @llvm.vector.extract.nxv2f64.nxv8f64(<vscale x 8 x double> [[ZM]], i64 4)
+// CHECK-NEXT: [[TMP7:%.*]] = tail call <vscale x 2 x double> @llvm.vector.extract.nxv2f64.nxv8f64(<vscale x 8 x double> [[ZM]], i64 6)
+// CHECK-NEXT: tail call void @llvm.aarch64.sme.fmla.vg1x4.nxv2f64(i32 [[SLICE_BASE:%.*]], <vscale x 2 x double> [[TMP0]], <vscale x 2 x double> [[TMP1]], <vscale x 2 x double> [[TMP2]], <vscale x 2 x double> [[TMP3]], <vscale x 2 x double> [[TMP4]], <vscale x 2 x double> [[TMP5]], <vscale x 2 x double> [[TMP6]], <vscale x 2 x double> [[TMP7]])
+// CHECK-NEXT: ret void
+//
+// CPP-CHECK-LABEL: @_Z15test_svmla4_f64j13svfloat64x4_tS_(
+// CPP-CHECK-NEXT: entry:
+// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 2 x double> @llvm.vector.extract.nxv2f64.nxv8f64(<vscale x 8 x double> [[ZN:%.*]], i64 0)
+// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 2 x double> @llvm.vector.extract.nxv2f64.nxv8f64(<vscale x 8 x double> [[ZN]], i64 2)
+// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call <vscale x 2 x double> @llvm.vector.extract.nxv2f64.nxv8f64(<vscale x 8 x double> [[ZN]], i64 4)
+// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call <vscale x 2 x double> @llvm.vector.extract.nxv2f64.nxv8f64(<vscale x 8 x double> [[ZN]], i64 6)
+// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call <vscale x 2 x double> @llvm.vector.extract.nxv2f64.nxv8f64(<vscale x 8 x double> [[ZM:%.*]], i64 0)
+// CPP-CHECK-NEXT: [[TMP5:%.*]] = tail call <vscale x 2 x double> @llvm.vector.extract.nxv2f64.nxv8f64(<vscale x 8 x double> [[ZM]], i64 2)
+// CPP-CHECK-NEXT: [[TMP6:%.*]] = tail call <vscale x 2 x double> @llvm.vector.extract.nxv2f64.nxv8f64(<vscale x 8 x double> [[ZM]], i64 4)
+// CPP-CHECK-NEXT: [[TMP7:%.*]] = tail call <vscale x 2 x double> @llvm.vector.extract.nxv2f64.nxv8f64(<vscale x 8 x double> [[ZM]], i64 6)
+// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.fmla.vg1x4.nxv2f64(i32 [[SLICE_BASE:%.*]], <vscale x 2 x double> [[TMP0]], <vscale x 2 x double> [[TMP1]], <vscale x 2 x double> [[TMP2]], <vscale x 2 x double> [[TMP3]], <vscale x 2 x double> [[TMP4]], <vscale x 2 x double> [[TMP5]], <vscale x 2 x double> [[TMP6]], <vscale x 2 x double> [[TMP7]])
+// CPP-CHECK-NEXT: ret void
+//
+void test_svmla4_f64(uint32_t slice_base, svfloat64x4_t zn, svfloat64x4_t zm) __arm_streaming __arm_shared_za {
+ SVE_ACLE_FUNC(svmla,,_za64,_f64,_vg1x4)(slice_base, zn, zm);
+}
+
+//
+// Multi, single
+// CHECK-LABEL: @test_svmla_single2_f64(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 2 x double> @llvm.vector.extract.nxv2f64.nxv4f64(<vscale x 4 x double> [[ZN:%.*]], i64 0)
+// CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 2 x double> @llvm.vector.extract.nxv2f64.nxv4f64(<vscale x 4 x double> [[ZN]], i64 2)
+// CHECK-NEXT: tail call void @llvm.aarch64.sme.fmla.single.vg1x2.nxv2f64(i32 [[SLICE_BASE:%.*]], <vscale x 2 x double> [[TMP0]], <vscale x 2 x double> [[TMP1]], <vscale x 2 x double> [[ZM:%.*]])
+// CHECK-NEXT: ret void
+//
+// CPP-CHECK-LABEL: @_Z22test_svmla_single2_f64j13svfloat64x2_tu13__SVFloat64_t(
+// CPP-CHECK-NEXT: entry:
+// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 2 x double> @llvm.vector.extract.nxv2f64.nxv4f64(<vscale x 4 x double> [[ZN:%.*]], i64 0)
+// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 2 x double> @llvm.vector.extract.nxv2f64.nxv4f64(<vscale x 4 x double> [[ZN]], i64 2)
+// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.fmla.single.vg1x2.nxv2f64(i32 [[SLICE_BASE:%.*]], <vscale x 2 x double> [[TMP0]], <vscale x 2 x double> [[TMP1]], <vscale x 2 x double> [[ZM:%.*]])
+// CPP-CHECK-NEXT: ret void
+//
+void test_svmla_single2_f64(uint32_t slice_base, svfloat64x2_t zn, svfloat64_t zm) __arm_streaming __arm_shared_za {
+ SVE_ACLE_FUNC(svmla,_single,_za64,_f64,_vg1x2)(slice_base, zn, zm);
+}
+
+// CHECK-LABEL: @test_svmla_single4_f64(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 2 x double> @llvm.vector.extract.nxv2f64.nxv8f64(<vscale x 8 x double> [[ZN:%.*]], i64 0)
+// CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 2 x double> @llvm.vector.extract.nxv2f64.nxv8f64(<vscale x 8 x double> [[ZN]], i64 2)
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <vscale x 2 x double> @llvm.vector.extract.nxv2f64.nxv8f64(<vscale x 8 x double> [[ZN]], i64 4)
+// CHECK-NEXT: [[TMP3:%.*]] = tail call <vscale x 2 x double> @llvm.vector.extract.nxv2f64.nxv8f64(<vscale x 8 x double> [[ZN]], i64 6)
+// CHECK-NEXT: tail call void @llvm.aarch64.sme.fmla.single.vg1x4.nxv2f64(i32 [[SLICE_BASE:%.*]], <vscale x 2 x double> [[TMP0]], <vscale x 2 x double> [[TMP1]], <vscale x 2 x double> [[TMP2]], <vscale x 2 x double> [[TMP3]], <vscale x 2 x double> [[ZM:%.*]])
+// CHECK-NEXT: ret void
+//
+// CPP-CHECK-LABEL: @_Z22test_svmla_single4_f64j13svfloat64x4_tu13__SVFloat64_t(
+// CPP-CHECK-NEXT: entry:
+// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 2 x double> @llvm.vector.extract.nxv2f64.nxv8f64(<vscale x 8 x double> [[ZN:%.*]], i64 0)
+// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 2 x double> @llvm.vector.extract.nxv2f64.nxv8f64(<vscale x 8 x double> [[ZN]], i64 2)
+// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call <vscale x 2 x double> @llvm.vector.extract.nxv2f64.nxv8f64(<vscale x 8 x double> [[ZN]], i64 4)
+// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call <vscale x 2 x double> @llvm.vector.extract.nxv2f64.nxv8f64(<vscale x 8 x double> [[ZN]], i64 6)
+// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.fmla.single.vg1x4.nxv2f64(i32 [[SLICE_BASE:%.*]], <vscale x 2 x double> [[TMP0]], <vscale x 2 x double> [[TMP1]], <vscale x 2 x double> [[TMP2]], <vscale x 2 x double> [[TMP3]], <vscale x 2 x double> [[ZM:%.*]])
+// CPP-CHECK-NEXT: ret void
+//
+void test_svmla_single4_f64(uint32_t slice_base, svfloat64x4_t zn, svfloat64_t zm) __arm_streaming __arm_shared_za {
+ SVE_ACLE_FUNC(svmla,_single,_za64,_f64,_vg1x4)(slice_base, zn, zm);
+}
+
+//
+// Multi, indexed
+// CHECK-LABEL: @test_svmla_lane2_f64(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 2 x double> @llvm.vector.extract.nxv2f64.nxv4f64(<vscale x 4 x double> [[ZN:%.*]], i64 0)
+// CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 2 x double> @llvm.vector.extract.nxv2f64.nxv4f64(<vscale x 4 x double> [[ZN]], i64 2)
+// CHECK-NEXT: tail call void @llvm.aarch64.sme.fmla.lane.vg1x2.nxv2f64(i32 [[SLICE_BASE:%.*]], <vscale x 2 x double> [[TMP0]], <vscale x 2 x double> [[TMP1]], <vscale x 2 x double> [[ZM:%.*]], i32 1)
+// CHECK-NEXT: ret void
+//
+// CPP-CHECK-LABEL: @_Z20test_svmla_lane2_f64j13svfloat64x2_tu13__SVFloat64_t(
+// CPP-CHECK-NEXT: entry:
+// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 2 x double> @llvm.vector.extract.nxv2f64.nxv4f64(<vscale x 4 x double> [[ZN:%.*]], i64 0)
+// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 2 x double> @llvm.vector.extract.nxv2f64.nxv4f64(<vscale x 4 x double> [[ZN]], i64 2)
+// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.fmla.lane.vg1x2.nxv2f64(i32 [[SLICE_BASE:%.*]], <vscale x 2 x double> [[TMP0]], <vscale x 2 x double> [[TMP1]], <vscale x 2 x double> [[ZM:%.*]], i32 1)
+// CPP-CHECK-NEXT: ret void
+//
+void test_svmla_lane2_f64(uint32_t slice_base, svfloat64x2_t zn, svfloat64_t zm) __arm_streaming __arm_shared_za {
+ SVE_ACLE_FUNC(svmla_lane,,_za64,_f64,_vg1x2)(slice_base, zn, zm, 1);
+}
+
+// CHECK-LABEL: @test_svmla_lane4_f64(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 2 x double> @llvm.vector.extract.nxv2f64.nxv8f64(<vscale x 8 x double> [[ZN:%.*]], i64 0)
+// CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 2 x double> @llvm.vector.extract.nxv2f64.nxv8f64(<vscale x 8 x double> [[ZN]], i64 2)
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <vscale x 2 x double> @llvm.vector.extract.nxv2f64.nxv8f64(<vscale x 8 x double> [[ZN]], i64 4)
+// CHECK-NEXT: [[TMP3:%.*]] = tail call <vscale x 2 x double> @llvm.vector.extract.nxv2f64.nxv8f64(<vscale x 8 x double> [[ZN]], i64 6)
+// CHECK-NEXT: tail call void @llvm.aarch64.sme.fmla.lane.vg1x4.nxv2f64(i32 [[SLICE_BASE:%.*]], <vscale x 2 x double> [[TMP0]], <vscale x 2 x double> [[TMP1]], <vscale x 2 x double> [[TMP2]], <vscale x 2 x double> [[TMP3]], <vscale x 2 x double> [[ZM:%.*]], i32 1)
+// CHECK-NEXT: ret void
+//
+// CPP-CHECK-LABEL: @_Z20test_svmla_lane4_f64j13svfloat64x4_tu13__SVFloat64_t(
+// CPP-CHECK-NEXT: entry:
+// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 2 x double> @llvm.vector.extract.nxv2f64.nxv8f64(<vscale x 8 x double> [[ZN:%.*]], i64 0)
+// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 2 x double> @llvm.vector.extract.nxv2f64.nxv8f64(<vscale x 8 x double> [[ZN]], i64 2)
+// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call <vscale x 2 x double> @llvm.vector.extract.nxv2f64.nxv8f64(<vscale x 8 x double> [[ZN]], i64 4)
+// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call <vscale x 2 x double> @llvm.vector.extract.nxv2f64.nxv8f64(<vscale x 8 x double> [[ZN]], i64 6)
+// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.fmla.lane.vg1x4.nxv2f64(i32 [[SLICE_BASE:%.*]], <vscale x 2 x double> [[TMP0]], <vscale x 2 x double> [[TMP1]], <vscale x 2 x double> [[TMP2]], <vscale x 2 x double> [[TMP3]], <vscale x 2 x double> [[ZM:%.*]], i32 1)
+// CPP-CHECK-NEXT: ret void
+//
+void test_svmla_lane4_f64(uint32_t slice_base, svfloat64x4_t zn, svfloat64_t zm) __arm_streaming __arm_shared_za {
+ SVE_ACLE_FUNC(svmla_lane,,_za64,_f64,_vg1x4)(slice_base, zn, zm, 1);
+}
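(A note between the two new test files: SVE_ACLE_FUNC is the usual
name-gluing macro, so each RUN line exercises both spellings of an
intrinsic. For example, SVE_ACLE_FUNC(svmla,_single,_za32,_f32,_vg1x2)
expands as follows:

  // default build:
  svmla_single_za32_f32_vg1x2(slice_base, zn, zm);
  // with SVE_OVERLOADED_FORMS defined (A2/A4 fragments dropped):
  svmla_za32_vg1x2(slice_base, zn, zm);

The lane tests above pass the largest legal index for each element type:
3 for the f32 forms and 1 for the f64 forms, matching the 0-3 and 0-1
lane ranges of the underlying ZA.S/ZA.D FMLA instructions.)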
diff --git a/clang/test/CodeGen/aarch64-sme2-intrinsics/acle_sme2_mls.c b/clang/test/CodeGen/aarch64-sme2-intrinsics/acle_sme2_mls.c
new file mode 100644
index 00000000000000..6e48c330bbedfa
--- /dev/null
+++ b/clang/test/CodeGen/aarch64-sme2-intrinsics/acle_sme2_mls.c
@@ -0,0 +1,292 @@
+// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py
+// REQUIRES: aarch64-registered-target
+
+// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme2 -target-feature +sme-f64f64 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s
+// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme2 -target-feature +sme-f64f64 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK
+// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sme2 -target-feature +sme-f64f64 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s
+// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sme2 -target-feature +sme-f64f64 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK
+// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme2 -target-feature +sme-f64f64 -S -disable-O0-optnone -Werror -Wall -o /dev/null %s
+#include <arm_sme_draft_spec_subject_to_change.h>
+
+#ifdef SVE_OVERLOADED_FORMS
+// A simple used,unused... macro, long enough to represent any SVE builtin.
+#define SVE_ACLE_FUNC(A1,A2_UNUSED,A3,A4_UNUSED,A5) A1##A3##A5
+#else
+#define SVE_ACLE_FUNC(A1,A2,A3,A4,A5) A1##A2##A3##A4##A5
+#endif
+
+//
+// Multi, multi
+// CHECK-LABEL: @test_svmls2_f32(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 4 x float> @llvm.vector.extract.nxv4f32.nxv8f32(<vscale x 8 x float> [[ZN:%.*]], i64 0)
+// CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 4 x float> @llvm.vector.extract.nxv4f32.nxv8f32(<vscale x 8 x float> [[ZN]], i64 4)
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <vscale x 4 x float> @llvm.vector.extract.nxv4f32.nxv8f32(<vscale x 8 x float> [[ZM:%.*]], i64 0)
+// CHECK-NEXT: [[TMP3:%.*]] = tail call <vscale x 4 x float> @llvm.vector.extract.nxv4f32.nxv8f32(<vscale x 8 x float> [[ZM]], i64 4)
+// CHECK-NEXT: tail call void @llvm.aarch64.sme.fmls.vg1x2.nxv4f32(i32 [[SLICE_BASE:%.*]], <vscale x 4 x float> [[TMP0]], <vscale x 4 x float> [[TMP1]], <vscale x 4 x float> [[TMP2]], <vscale x 4 x float> [[TMP3]])
+// CHECK-NEXT: ret void
+//
+// CPP-CHECK-LABEL: @_Z15test_svmls2_f32j13svfloat32x2_tS_(
+// CPP-CHECK-NEXT: entry:
+// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 4 x float> @llvm.vector.extract.nxv4f32.nxv8f32(<vscale x 8 x float> [[ZN:%.*]], i64 0)
+// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 4 x float> @llvm.vector.extract.nxv4f32.nxv8f32(<vscale x 8 x float> [[ZN]], i64 4)
+// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call <vscale x 4 x float> @llvm.vector.extract.nxv4f32.nxv8f32(<vscale x 8 x float> [[ZM:%.*]], i64 0)
+// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call <vscale x 4 x float> @llvm.vector.extract.nxv4f32.nxv8f32(<vscale x 8 x float> [[ZM]], i64 4)
+// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.fmls.vg1x2.nxv4f32(i32 [[SLICE_BASE:%.*]], <vscale x 4 x float> [[TMP0]], <vscale x 4 x float> [[TMP1]], <vscale x 4 x float> [[TMP2]], <vscale x 4 x float> [[TMP3]])
+// CPP-CHECK-NEXT: ret void
+//
+void test_svmls2_f32(uint32_t slice_base, svfloat32x2_t zn, svfloat32x2_t zm) __arm_streaming __arm_shared_za {
+ SVE_ACLE_FUNC(svmls,,_za32,_f32,_vg1x2)(slice_base, zn, zm);
+}
+
+// CHECK-LABEL: @test_svmls4_f32(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 4 x float> @llvm.vector.extract.nxv4f32.nxv16f32(<vscale x 16 x float> [[ZN:%.*]], i64 0)
+// CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 4 x float> @llvm.vector.extract.nxv4f32.nxv16f32(<vscale x 16 x float> [[ZN]], i64 4)
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <vscale x 4 x float> @llvm.vector.extract.nxv4f32.nxv16f32(<vscale x 16 x float> [[ZN]], i64 8)
+// CHECK-NEXT: [[TMP3:%.*]] = tail call <vscale x 4 x float> @llvm.vector.extract.nxv4f32.nxv16f32(<vscale x 16 x float> [[ZN]], i64 12)
+// CHECK-NEXT: [[TMP4:%.*]] = tail call <vscale x 4 x float> @llvm.vector.extract.nxv4f32.nxv16f32(<vscale x 16 x float> [[ZM:%.*]], i64 0)
+// CHECK-NEXT: [[TMP5:%.*]] = tail call <vscale x 4 x float> @llvm.vector.extract.nxv4f32.nxv16f32(<vscale x 16 x float> [[ZM]], i64 4)
+// CHECK-NEXT: [[TMP6:%.*]] = tail call <vscale x 4 x float> @llvm.vector.extract.nxv4f32.nxv16f32(<vscale x 16 x float> [[ZM]], i64 8)
+// CHECK-NEXT: [[TMP7:%.*]] = tail call <vscale x 4 x float> @llvm.vector.extract.nxv4f32.nxv16f32(<vscale x 16 x float> [[ZM]], i64 12)
+// CHECK-NEXT: tail call void @llvm.aarch64.sme.fmls.vg1x4.nxv4f32(i32 [[SLICE_BASE:%.*]], <vscale x 4 x float> [[TMP0]], <vscale x 4 x float> [[TMP1]], <vscale x 4 x float> [[TMP2]], <vscale x 4 x float> [[TMP3]], <vscale x 4 x float> [[TMP4]], <vscale x 4 x float> [[TMP5]], <vscale x 4 x float> [[TMP6]], <vscale x 4 x float> [[TMP7]])
+// CHECK-NEXT: ret void
+//
+// CPP-CHECK-LABEL: @_Z15test_svmls4_f32j13svfloat32x4_tS_(
+// CPP-CHECK-NEXT: entry:
+// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 4 x float> @llvm.vector.extract.nxv4f32.nxv16f32(<vscale x 16 x float> [[ZN:%.*]], i64 0)
+// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 4 x float> @llvm.vector.extract.nxv4f32.nxv16f32(<vscale x 16 x float> [[ZN]], i64 4)
+// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call <vscale x 4 x float> @llvm.vector.extract.nxv4f32.nxv16f32(<vscale x 16 x float> [[ZN]], i64 8)
+// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call <vscale x 4 x float> @llvm.vector.extract.nxv4f32.nxv16f32(<vscale x 16 x float> [[ZN]], i64 12)
+// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call <vscale x 4 x float> @llvm.vector.extract.nxv4f32.nxv16f32(<vscale x 16 x float> [[ZM:%.*]], i64 0)
+// CPP-CHECK-NEXT: [[TMP5:%.*]] = tail call <vscale x 4 x float> @llvm.vector.extract.nxv4f32.nxv16f32(<vscale x 16 x float> [[ZM]], i64 4)
+// CPP-CHECK-NEXT: [[TMP6:%.*]] = tail call <vscale x 4 x float> @llvm.vector.extract.nxv4f32.nxv16f32(<vscale x 16 x float> [[ZM]], i64 8)
+// CPP-CHECK-NEXT: [[TMP7:%.*]] = tail call <vscale x 4 x float> @llvm.vector.extract.nxv4f32.nxv16f32(<vscale x 16 x float> [[ZM]], i64 12)
+// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.fmls.vg1x4.nxv4f32(i32 [[SLICE_BASE:%.*]], <vscale x 4 x float> [[TMP0]], <vscale x 4 x float> [[TMP1]], <vscale x 4 x float> [[TMP2]], <vscale x 4 x float> [[TMP3]], <vscale x 4 x float> [[TMP4]], <vscale x 4 x float> [[TMP5]], <vscale x 4 x float> [[TMP6]], <vscale x 4 x float> [[TMP7]])
+// CPP-CHECK-NEXT: ret void
+//
+void test_svmls4_f32(uint32_t slice_base, svfloat32x4_t zn, svfloat32x4_t zm) __arm_streaming __arm_shared_za {
+ SVE_ACLE_FUNC(svmls,,_za32,_f32,_vg1x4)(slice_base, zn, zm);
+}
+
+//
+// Multi, single
+// CHECK-LABEL: @test_svmls_single2_f32(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 4 x float> @llvm.vector.extract.nxv4f32.nxv8f32(<vscale x 8 x float> [[ZN:%.*]], i64 0)
+// CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 4 x float> @llvm.vector.extract.nxv4f32.nxv8f32(<vscale x 8 x float> [[ZN]], i64 4)
+// CHECK-NEXT: tail call void @llvm.aarch64.sme.fmls.single.vg1x2.nxv4f32(i32 [[SLICE_BASE:%.*]], <vscale x 4 x float> [[TMP0]], <vscale x 4 x float> [[TMP1]], <vscale x 4 x float> [[ZM:%.*]])
+// CHECK-NEXT: ret void
+//
+// CPP-CHECK-LABEL: @_Z22test_svmls_single2_f32j13svfloat32x2_tu13__SVFloat32_t(
+// CPP-CHECK-NEXT: entry:
+// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 4 x float> @llvm.vector.extract.nxv4f32.nxv8f32(<vscale x 8 x float> [[ZN:%.*]], i64 0)
+// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 4 x float> @llvm.vector.extract.nxv4f32.nxv8f32(<vscale x 8 x float> [[ZN]], i64 4)
+// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.fmls.single.vg1x2.nxv4f32(i32 [[SLICE_BASE:%.*]], <vscale x 4 x float> [[TMP0]], <vscale x 4 x float> [[TMP1]], <vscale x 4 x float> [[ZM:%.*]])
+// CPP-CHECK-NEXT: ret void
+//
+void test_svmls_single2_f32(uint32_t slice_base, svfloat32x2_t zn, svfloat32_t zm) __arm_streaming __arm_shared_za {
+ SVE_ACLE_FUNC(svmls,_single,_za32,_f32,_vg1x2)(slice_base, zn, zm);
+}
+
+// CHECK-LABEL: @test_svmls_single4_f32(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 4 x float> @llvm.vector.extract.nxv4f32.nxv16f32(<vscale x 16 x float> [[ZN:%.*]], i64 0)
+// CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 4 x float> @llvm.vector.extract.nxv4f32.nxv16f32(<vscale x 16 x float> [[ZN]], i64 4)
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <vscale x 4 x float> @llvm.vector.extract.nxv4f32.nxv16f32(<vscale x 16 x float> [[ZN]], i64 8)
+// CHECK-NEXT: [[TMP3:%.*]] = tail call <vscale x 4 x float> @llvm.vector.extract.nxv4f32.nxv16f32(<vscale x 16 x float> [[ZN]], i64 12)
+// CHECK-NEXT: tail call void @llvm.aarch64.sme.fmls.single.vg1x4.nxv4f32(i32 [[SLICE_BASE:%.*]], <vscale x 4 x float> [[TMP0]], <vscale x 4 x float> [[TMP1]], <vscale x 4 x float> [[TMP2]], <vscale x 4 x float> [[TMP3]], <vscale x 4 x float> [[ZM:%.*]])
+// CHECK-NEXT: ret void
+//
+// CPP-CHECK-LABEL: @_Z22test_svmls_single4_f32j13svfloat32x4_tu13__SVFloat32_t(
+// CPP-CHECK-NEXT: entry:
+// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 4 x float> @llvm.vector.extract.nxv4f32.nxv16f32(<vscale x 16 x float> [[ZN:%.*]], i64 0)
+// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 4 x float> @llvm.vector.extract.nxv4f32.nxv16f32(<vscale x 16 x float> [[ZN]], i64 4)
+// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call <vscale x 4 x float> @llvm.vector.extract.nxv4f32.nxv16f32(<vscale x 16 x float> [[ZN]], i64 8)
+// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call <vscale x 4 x float> @llvm.vector.extract.nxv4f32.nxv16f32(<vscale x 16 x float> [[ZN]], i64 12)
+// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.fmls.single.vg1x4.nxv4f32(i32 [[SLICE_BASE:%.*]], <vscale x 4 x float> [[TMP0]], <vscale x 4 x float> [[TMP1]], <vscale x 4 x float> [[TMP2]], <vscale x 4 x float> [[TMP3]], <vscale x 4 x float> [[ZM:%.*]])
+// CPP-CHECK-NEXT: ret void
+//
+void test_svmls_single4_f32(uint32_t slice_base, svfloat32x4_t zn, svfloat32_t zm) __arm_streaming __arm_shared_za {
+ SVE_ACLE_FUNC(svmls,_single,_za32,_f32,_vg1x4)(slice_base, zn, zm);
+}
+
+//
+// Multi, indexed
+// CHECK-LABEL: @test_svmls_lane2_f32(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 4 x float> @llvm.vector.extract.nxv4f32.nxv8f32(<vscale x 8 x float> [[ZN:%.*]], i64 0)
+// CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 4 x float> @llvm.vector.extract.nxv4f32.nxv8f32(<vscale x 8 x float> [[ZN]], i64 4)
+// CHECK-NEXT: tail call void @llvm.aarch64.sme.fmls.lane.vg1x2.nxv4f32(i32 [[SLICE_BASE:%.*]], <vscale x 4 x float> [[TMP0]], <vscale x 4 x float> [[TMP1]], <vscale x 4 x float> [[ZM:%.*]], i32 3)
+// CHECK-NEXT: ret void
+//
+// CPP-CHECK-LABEL: @_Z20test_svmls_lane2_f32j13svfloat32x2_tu13__SVFloat32_t(
+// CPP-CHECK-NEXT: entry:
+// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 4 x float> @llvm.vector.extract.nxv4f32.nxv8f32(<vscale x 8 x float> [[ZN:%.*]], i64 0)
+// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 4 x float> @llvm.vector.extract.nxv4f32.nxv8f32(<vscale x 8 x float> [[ZN]], i64 4)
+// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.fmls.lane.vg1x2.nxv4f32(i32 [[SLICE_BASE:%.*]], <vscale x 4 x float> [[TMP0]], <vscale x 4 x float> [[TMP1]], <vscale x 4 x float> [[ZM:%.*]], i32 3)
+// CPP-CHECK-NEXT: ret void
+//
+void test_svmls_lane2_f32(uint32_t slice_base, svfloat32x2_t zn, svfloat32_t zm) __arm_streaming __arm_shared_za {
+ SVE_ACLE_FUNC(svmls_lane,,_za32,_f32,_vg1x2)(slice_base, zn, zm, 3);
+}
+
+// CHECK-LABEL: @test_svmls_lane4_f32(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 4 x float> @llvm.vector.extract.nxv4f32.nxv16f32(<vscale x 16 x float> [[ZN:%.*]], i64 0)
+// CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 4 x float> @llvm.vector.extract.nxv4f32.nxv16f32(<vscale x 16 x float> [[ZN]], i64 4)
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <vscale x 4 x float> @llvm.vector.extract.nxv4f32.nxv16f32(<vscale x 16 x float> [[ZN]], i64 8)
+// CHECK-NEXT: [[TMP3:%.*]] = tail call <vscale x 4 x float> @llvm.vector.extract.nxv4f32.nxv16f32(<vscale x 16 x float> [[ZN]], i64 12)
+// CHECK-NEXT: tail call void @llvm.aarch64.sme.fmls.lane.vg1x4.nxv4f32(i32 [[SLICE_BASE:%.*]], <vscale x 4 x float> [[TMP0]], <vscale x 4 x float> [[TMP1]], <vscale x 4 x float> [[TMP2]], <vscale x 4 x float> [[TMP3]], <vscale x 4 x float> [[ZM:%.*]], i32 3)
+// CHECK-NEXT: ret void
+//
+// CPP-CHECK-LABEL: @_Z20test_svmls_lane4_f32j13svfloat32x4_tu13__SVFloat32_t(
+// CPP-CHECK-NEXT: entry:
+// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 4 x float> @llvm.vector.extract.nxv4f32.nxv16f32(<vscale x 16 x float> [[ZN:%.*]], i64 0)
+// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 4 x float> @llvm.vector.extract.nxv4f32.nxv16f32(<vscale x 16 x float> [[ZN]], i64 4)
+// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call <vscale x 4 x float> @llvm.vector.extract.nxv4f32.nxv16f32(<vscale x 16 x float> [[ZN]], i64 8)
+// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call <vscale x 4 x float> @llvm.vector.extract.nxv4f32.nxv16f32(<vscale x 16 x float> [[ZN]], i64 12)
+// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.fmls.lane.vg1x4.nxv4f32(i32 [[SLICE_BASE:%.*]], <vscale x 4 x float> [[TMP0]], <vscale x 4 x float> [[TMP1]], <vscale x 4 x float> [[TMP2]], <vscale x 4 x float> [[TMP3]], <vscale x 4 x float> [[ZM:%.*]], i32 3)
+// CPP-CHECK-NEXT: ret void
+//
+void test_svmls_lane4_f32(uint32_t slice_base, svfloat32x4_t zn, svfloat32_t zm) __arm_streaming __arm_shared_za {
+ SVE_ACLE_FUNC(svmls_lane,,_za32,_f32,_vg1x4)(slice_base, zn, zm, 3);
+}
+
+//
+// Multi, multi
+// CHECK-LABEL: @test_svmls2_f64(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 2 x double> @llvm.vector.extract.nxv2f64.nxv4f64(<vscale x 4 x double> [[ZN:%.*]], i64 0)
+// CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 2 x double> @llvm.vector.extract.nxv2f64.nxv4f64(<vscale x 4 x double> [[ZN]], i64 2)
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <vscale x 2 x double> @llvm.vector.extract.nxv2f64.nxv4f64(<vscale x 4 x double> [[ZM:%.*]], i64 0)
+// CHECK-NEXT: [[TMP3:%.*]] = tail call <vscale x 2 x double> @llvm.vector.extract.nxv2f64.nxv4f64(<vscale x 4 x double> [[ZM]], i64 2)
+// CHECK-NEXT: tail call void @llvm.aarch64.sme.fmls.vg1x2.nxv2f64(i32 [[SLICE_BASE:%.*]], <vscale x 2 x double> [[TMP0]], <vscale x 2 x double> [[TMP1]], <vscale x 2 x double> [[TMP2]], <vscale x 2 x double> [[TMP3]])
+// CHECK-NEXT: ret void
+//
+// CPP-CHECK-LABEL: @_Z15test_svmls2_f64j13svfloat64x2_tS_(
+// CPP-CHECK-NEXT: entry:
+// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 2 x double> @llvm.vector.extract.nxv2f64.nxv4f64(<vscale x 4 x double> [[ZN:%.*]], i64 0)
+// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 2 x double> @llvm.vector.extract.nxv2f64.nxv4f64(<vscale x 4 x double> [[ZN]], i64 2)
+// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call <vscale x 2 x double> @llvm.vector.extract.nxv2f64.nxv4f64(<vscale x 4 x double> [[ZM:%.*]], i64 0)
+// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call <vscale x 2 x double> @llvm.vector.extract.nxv2f64.nxv4f64(<vscale x 4 x double> [[ZM]], i64 2)
+// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.fmls.vg1x2.nxv2f64(i32 [[SLICE_BASE:%.*]], <vscale x 2 x double> [[TMP0]], <vscale x 2 x double> [[TMP1]], <vscale x 2 x double> [[TMP2]], <vscale x 2 x double> [[TMP3]])
+// CPP-CHECK-NEXT: ret void
+//
+void test_svmls2_f64(uint32_t slice_base, svfloat64x2_t zn, svfloat64x2_t zm) __arm_streaming __arm_shared_za {
+ SVE_ACLE_FUNC(svmls,,_za64,_f64,_vg1x2)(slice_base, zn, zm);
+}
+
+// CHECK-LABEL: @test_svmls4_f64(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 2 x double> @llvm.vector.extract.nxv2f64.nxv8f64(<vscale x 8 x double> [[ZN:%.*]], i64 0)
+// CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 2 x double> @llvm.vector.extract.nxv2f64.nxv8f64(<vscale x 8 x double> [[ZN]], i64 2)
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <vscale x 2 x double> @llvm.vector.extract.nxv2f64.nxv8f64(<vscale x 8 x double> [[ZN]], i64 4)
+// CHECK-NEXT: [[TMP3:%.*]] = tail call <vscale x 2 x double> @llvm.vector.extract.nxv2f64.nxv8f64(<vscale x 8 x double> [[ZN]], i64 6)
+// CHECK-NEXT: [[TMP4:%.*]] = tail call <vscale x 2 x double> @llvm.vector.extract.nxv2f64.nxv8f64(<vscale x 8 x double> [[ZM:%.*]], i64 0)
+// CHECK-NEXT: [[TMP5:%.*]] = tail call <vscale x 2 x double> @llvm.vector.extract.nxv2f64.nxv8f64(<vscale x 8 x double> [[ZM]], i64 2)
+// CHECK-NEXT: [[TMP6:%.*]] = tail call <vscale x 2 x double> @llvm.vector.extract.nxv2f64.nxv8f64(<vscale x 8 x double> [[ZM]], i64 4)
+// CHECK-NEXT: [[TMP7:%.*]] = tail call <vscale x 2 x double> @llvm.vector.extract.nxv2f64.nxv8f64(<vscale x 8 x double> [[ZM]], i64 6)
+// CHECK-NEXT: tail call void @llvm.aarch64.sme.fmls.vg1x4.nxv2f64(i32 [[SLICE_BASE:%.*]], <vscale x 2 x double> [[TMP0]], <vscale x 2 x double> [[TMP1]], <vscale x 2 x double> [[TMP2]], <vscale x 2 x double> [[TMP3]], <vscale x 2 x double> [[TMP4]], <vscale x 2 x double> [[TMP5]], <vscale x 2 x double> [[TMP6]], <vscale x 2 x double> [[TMP7]])
+// CHECK-NEXT: ret void
+//
+// CPP-CHECK-LABEL: @_Z15test_svmls4_f64j13svfloat64x4_tS_(
+// CPP-CHECK-NEXT: entry:
+// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 2 x double> @llvm.vector.extract.nxv2f64.nxv8f64(<vscale x 8 x double> [[ZN:%.*]], i64 0)
+// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 2 x double> @llvm.vector.extract.nxv2f64.nxv8f64(<vscale x 8 x double> [[ZN]], i64 2)
+// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call <vscale x 2 x double> @llvm.vector.extract.nxv2f64.nxv8f64(<vscale x 8 x double> [[ZN]], i64 4)
+// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call <vscale x 2 x double> @llvm.vector.extract.nxv2f64.nxv8f64(<vscale x 8 x double> [[ZN]], i64 6)
+// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call <vscale x 2 x double> @llvm.vector.extract.nxv2f64.nxv8f64(<vscale x 8 x double> [[ZM:%.*]], i64 0)
+// CPP-CHECK-NEXT: [[TMP5:%.*]] = tail call <vscale x 2 x double> @llvm.vector.extract.nxv2f64.nxv8f64(<vscale x 8 x double> [[ZM]], i64 2)
+// CPP-CHECK-NEXT: [[TMP6:%.*]] = tail call <vscale x 2 x double> @llvm.vector.extract.nxv2f64.nxv8f64(<vscale x 8 x double> [[ZM]], i64 4)
+// CPP-CHECK-NEXT: [[TMP7:%.*]] = tail call <vscale x 2 x double> @llvm.vector.extract.nxv2f64.nxv8f64(<vscale x 8 x double> [[ZM]], i64 6)
+// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.fmls.vg1x4.nxv2f64(i32 [[SLICE_BASE:%.*]], <vscale x 2 x double> [[TMP0]], <vscale x 2 x double> [[TMP1]], <vscale x 2 x double> [[TMP2]], <vscale x 2 x double> [[TMP3]], <vscale x 2 x double> [[TMP4]], <vscale x 2 x double> [[TMP5]], <vscale x 2 x double> [[TMP6]], <vscale x 2 x double> [[TMP7]])
+// CPP-CHECK-NEXT: ret void
+//
+void test_svmls4_f64(uint32_t slice_base, svfloat64x4_t zn, svfloat64x4_t zm) __arm_streaming __arm_shared_za {
+ SVE_ACLE_FUNC(svmls,,_za64,_f64,_vg1x4)(slice_base, zn, zm);
+}
+
+//
+// Multi, single
+// CHECK-LABEL: @test_svmls_single2_f64(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 2 x double> @llvm.vector.extract.nxv2f64.nxv4f64(<vscale x 4 x double> [[ZN:%.*]], i64 0)
+// CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 2 x double> @llvm.vector.extract.nxv2f64.nxv4f64(<vscale x 4 x double> [[ZN]], i64 2)
+// CHECK-NEXT: tail call void @llvm.aarch64.sme.fmls.single.vg1x2.nxv2f64(i32 [[SLICE_BASE:%.*]], <vscale x 2 x double> [[TMP0]], <vscale x 2 x double> [[TMP1]], <vscale x 2 x double> [[ZM:%.*]])
+// CHECK-NEXT: ret void
+//
+// CPP-CHECK-LABEL: @_Z22test_svmls_single2_f64j13svfloat64x2_tu13__SVFloat64_t(
+// CPP-CHECK-NEXT: entry:
+// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 2 x double> @llvm.vector.extract.nxv2f64.nxv4f64(<vscale x 4 x double> [[ZN:%.*]], i64 0)
+// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 2 x double> @llvm.vector.extract.nxv2f64.nxv4f64(<vscale x 4 x double> [[ZN]], i64 2)
+// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.fmls.single.vg1x2.nxv2f64(i32 [[SLICE_BASE:%.*]], <vscale x 2 x double> [[TMP0]], <vscale x 2 x double> [[TMP1]], <vscale x 2 x double> [[ZM:%.*]])
+// CPP-CHECK-NEXT: ret void
+//
+void test_svmls_single2_f64(uint32_t slice_base, svfloat64x2_t zn, svfloat64_t zm) __arm_streaming __arm_shared_za {
+ SVE_ACLE_FUNC(svmls,_single,_za64,_f64,_vg1x2)(slice_base, zn, zm);
+}
+
+// CHECK-LABEL: @test_svmls_single4_f64(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 2 x double> @llvm.vector.extract.nxv2f64.nxv8f64(<vscale x 8 x double> [[ZN:%.*]], i64 0)
+// CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 2 x double> @llvm.vector.extract.nxv2f64.nxv8f64(<vscale x 8 x double> [[ZN]], i64 2)
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <vscale x 2 x double> @llvm.vector.extract.nxv2f64.nxv8f64(<vscale x 8 x double> [[ZN]], i64 4)
+// CHECK-NEXT: [[TMP3:%.*]] = tail call <vscale x 2 x double> @llvm.vector.extract.nxv2f64.nxv8f64(<vscale x 8 x double> [[ZN]], i64 6)
+// CHECK-NEXT: tail call void @llvm.aarch64.sme.fmls.single.vg1x4.nxv2f64(i32 [[SLICE_BASE:%.*]], <vscale x 2 x double> [[TMP0]], <vscale x 2 x double> [[TMP1]], <vscale x 2 x double> [[TMP2]], <vscale x 2 x double> [[TMP3]], <vscale x 2 x double> [[ZM:%.*]])
+// CHECK-NEXT: ret void
+//
+// CPP-CHECK-LABEL: @_Z22test_svmls_single4_f64j13svfloat64x4_tu13__SVFloat64_t(
+// CPP-CHECK-NEXT: entry:
+// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 2 x double> @llvm.vector.extract.nxv2f64.nxv8f64(<vscale x 8 x double> [[ZN:%.*]], i64 0)
+// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 2 x double> @llvm.vector.extract.nxv2f64.nxv8f64(<vscale x 8 x double> [[ZN]], i64 2)
+// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call <vscale x 2 x double> @llvm.vector.extract.nxv2f64.nxv8f64(<vscale x 8 x double> [[ZN]], i64 4)
+// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call <vscale x 2 x double> @llvm.vector.extract.nxv2f64.nxv8f64(<vscale x 8 x double> [[ZN]], i64 6)
+// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.fmls.single.vg1x4.nxv2f64(i32 [[SLICE_BASE:%.*]], <vscale x 2 x double> [[TMP0]], <vscale x 2 x double> [[TMP1]], <vscale x 2 x double> [[TMP2]], <vscale x 2 x double> [[TMP3]], <vscale x 2 x double> [[ZM:%.*]])
+// CPP-CHECK-NEXT: ret void
+//
+void test_svmls_single4_f64(uint32_t slice_base, svfloat64x4_t zn, svfloat64_t zm) __arm_streaming __arm_shared_za {
+ SVE_ACLE_FUNC(svmls,_single,_za64,_f64,_vg1x4)(slice_base, zn, zm);
+}
+
+//
+// Multi, indexed
+// CHECK-LABEL: @test_svmls_lane2_f64(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 2 x double> @llvm.vector.extract.nxv2f64.nxv4f64(<vscale x 4 x double> [[ZN:%.*]], i64 0)
+// CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 2 x double> @llvm.vector.extract.nxv2f64.nxv4f64(<vscale x 4 x double> [[ZN]], i64 2)
+// CHECK-NEXT: tail call void @llvm.aarch64.sme.fmls.lane.vg1x2.nxv2f64(i32 [[SLICE_BASE:%.*]], <vscale x 2 x double> [[TMP0]], <vscale x 2 x double> [[TMP1]], <vscale x 2 x double> [[ZM:%.*]], i32 1)
+// CHECK-NEXT: ret void
+//
+// CPP-CHECK-LABEL: @_Z20test_svmls_lane2_f64j13svfloat64x2_tu13__SVFloat64_t(
+// CPP-CHECK-NEXT: entry:
+// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 2 x double> @llvm.vector.extract.nxv2f64.nxv4f64(<vscale x 4 x double> [[ZN:%.*]], i64 0)
+// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 2 x double> @llvm.vector.extract.nxv2f64.nxv4f64(<vscale x 4 x double> [[ZN]], i64 2)
+// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.fmls.lane.vg1x2.nxv2f64(i32 [[SLICE_BASE:%.*]], <vscale x 2 x double> [[TMP0]], <vscale x 2 x double> [[TMP1]], <vscale x 2 x double> [[ZM:%.*]], i32 1)
+// CPP-CHECK-NEXT: ret void
+//
+void test_svmls_lane2_f64(uint32_t slice_base, svfloat64x2_t zn, svfloat64_t zm) __arm_streaming __arm_shared_za {
+ SVE_ACLE_FUNC(svmls_lane,,_za64,_f64,_vg1x2)(slice_base, zn, zm, 1);
+}
+
+// CHECK-LABEL: @test_svmls_lane4_f64(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 2 x double> @llvm.vector.extract.nxv2f64.nxv8f64(<vscale x 8 x double> [[ZN:%.*]], i64 0)
+// CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 2 x double> @llvm.vector.extract.nxv2f64.nxv8f64(<vscale x 8 x double> [[ZN]], i64 2)
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <vscale x 2 x double> @llvm.vector.extract.nxv2f64.nxv8f64(<vscale x 8 x double> [[ZN]], i64 4)
+// CHECK-NEXT: [[TMP3:%.*]] = tail call <vscale x 2 x double> @llvm.vector.extract.nxv2f64.nxv8f64(<vscale x 8 x double> [[ZN]], i64 6)
+// CHECK-NEXT: tail call void @llvm.aarch64.sme.fmls.lane.vg1x4.nxv2f64(i32 [[SLICE_BASE:%.*]], <vscale x 2 x double> [[TMP0]], <vscale x 2 x double> [[TMP1]], <vscale x 2 x double> [[TMP2]], <vscale x 2 x double> [[TMP3]], <vscale x 2 x double> [[ZM:%.*]], i32 1)
+// CHECK-NEXT: ret void
+//
+// CPP-CHECK-LABEL: @_Z20test_svmls_lane4_f64j13svfloat64x4_tu13__SVFloat64_t(
+// CPP-CHECK-NEXT: entry:
+// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 2 x double> @llvm.vector.extract.nxv2f64.nxv8f64(<vscale x 8 x double> [[ZN:%.*]], i64 0)
+// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 2 x double> @llvm.vector.extract.nxv2f64.nxv8f64(<vscale x 8 x double> [[ZN]], i64 2)
+// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call <vscale x 2 x double> @llvm.vector.extract.nxv2f64.nxv8f64(<vscale x 8 x double> [[ZN]], i64 4)
+// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call <vscale x 2 x double> @llvm.vector.extract.nxv2f64.nxv8f64(<vscale x 8 x double> [[ZN]], i64 6)
+// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.fmls.lane.vg1x4.nxv2f64(i32 [[SLICE_BASE:%.*]], <vscale x 2 x double> [[TMP0]], <vscale x 2 x double> [[TMP1]], <vscale x 2 x double> [[TMP2]], <vscale x 2 x double> [[TMP3]], <vscale x 2 x double> [[ZM:%.*]], i32 1)
+// CPP-CHECK-NEXT: ret void
+//
+void test_svmls_lane4_f64(uint32_t slice_base, svfloat64x4_t zn, svfloat64_t zm) __arm_streaming __arm_shared_za {
+ SVE_ACLE_FUNC(svmls_lane,,_za64,_f64,_vg1x4)(slice_base, zn, zm, 1);
+}
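
For readers skimming the generated CHECK lines above: each _vg1x2/_vg1x4 builtin is lowered by extracting the subvectors of the tuple argument and passing them, together with the slice index, to one SME intrinsic. A minimal scalar sketch of what the x2 multiply-subtract computes follows; it assumes a fixed 128-bit vector length, flattens the SME vector-group slice selection to consecutive rows, and the names fmls_vg1x2_model and VL_D are hypothetical, not part of the patch.

#include <stddef.h>

enum { VL_D = 2 }; /* doubles per vector at the minimum 128-bit VL */

/* Hypothetical scalar model of the x2 multi-vector FMLS above; real
 * slice selection follows the SME vector-group rules and is simplified
 * to consecutive rows here. */
static void fmls_vg1x2_model(double za[][VL_D], unsigned slice_base,
                             const double zn[2][VL_D],
                             const double zm[2][VL_D]) {
  for (size_t v = 0; v < 2; ++v)      /* the two vectors of each tuple */
    for (size_t e = 0; e < VL_D; ++e) /* elements within each vector */
      za[slice_base + v][e] -= zn[v][e] * zm[v][e];
}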
>From e24765d9d4d6f5bc66233dd199a8345375260443 Mon Sep 17 00:00:00 2001
From: Dinar Temirbulatov <Dinar.Temirbulatov at arm.com>
Date: Mon, 18 Dec 2023 14:29:15 +0000
Subject: [PATCH 6/7] Add immediate-range tests to acle_sme2_imm.cpp
---
.../aarch64-sme2-intrinsics/acle_sme2_imm.cpp | 44 +++++++++++++++++++
1 file changed, 44 insertions(+)
diff --git a/clang/test/Sema/aarch64-sme2-intrinsics/acle_sme2_imm.cpp b/clang/test/Sema/aarch64-sme2-intrinsics/acle_sme2_imm.cpp
index 6ab6dabd92b198..d07d83a53e4623 100644
--- a/clang/test/Sema/aarch64-sme2-intrinsics/acle_sme2_imm.cpp
+++ b/clang/test/Sema/aarch64-sme2-intrinsics/acle_sme2_imm.cpp
@@ -241,3 +241,47 @@ void test_bfmlslb_bad_lane(svfloat32_t zda, svbfloat16_t zn, svbfloat16_t zm) __
svbfmlslb_lane_f32(zda, zn, zm, 8); // expected-error {{argument value 8 is outside the valid range [0, 7]}}
svbfmlslt_lane_f32(zda, zn, zm, 8); // expected-error {{argument value 8 is outside the valid range [0, 7]}}
}
+
+void test_multiply_add_sub_long(uint32_t base, svint8_t s8, svuint8_t u8,
+ svint16_t s16, svuint16_t u16, svint8x2_t s8x2,
+ svuint8x2_t u8x2, svint16x2_t s16x2, svuint16x2_t u16x2,
+ svint8x4_t s8x4, svuint8x4_t u8x4, svint16x4_t s16x4, svuint16x4_t u16x4) __arm_streaming __arm_shared_za {
+
+ svmla_lane_za32_s8_vg4x1(base, s8, s8, 16); // expected-error {{argument value 16 is outside the valid range [0, 15]}}
+ svmla_lane_za32_u8_vg4x1(base, u8, u8, 16); // expected-error {{argument value 16 is outside the valid range [0, 15]}}
+ svmla_lane_za64_s16_vg4x1(base, s16, s16, 8); // expected-error {{argument value 8 is outside the valid range [0, 7]}}
+ svmla_lane_za64_u16_vg4x1(base, u16, u16, 8); // expected-error {{argument value 8 is outside the valid range [0, 7]}}
+
+ svmla_lane_za32_s8_vg4x2(base, s8x2, s8, 16); // expected-error {{argument value 16 is outside the valid range [0, 15]}}
+ svmla_lane_za32_u8_vg4x2(base, u8x2, u8, 16); // expected-error {{argument value 16 is outside the valid range [0, 15]}}
+ svmla_lane_za64_s16_vg4x2(base, s16x2, s16, 8); // expected-error {{argument value 8 is outside the valid range [0, 7]}}
+ svmla_lane_za64_u16_vg4x2(base, u16x2, u16, 8); // expected-error {{argument value 8 is outside the valid range [0, 7]}}
+
+ svmla_lane_za32_s8_vg4x4(base, s8x4, s8, 16); // expected-error {{argument value 16 is outside the valid range [0, 15]}}
+ svmla_lane_za32_u8_vg4x4(base, u8x4, u8, 16); // expected-error {{argument value 16 is outside the valid range [0, 15]}}
+ svmla_lane_za64_s16_vg4x4(base, s16x4, s16, 8); // expected-error {{argument value 8 is outside the valid range [0, 7]}}
+ svmla_lane_za64_u16_vg4x4(base, u16x4, u16, 8); // expected-error {{argument value 8 is outside the valid range [0, 7]}}
+
+ svmls_lane_za32_s8_vg4x1(base, s8, s8, 16); // expected-error {{argument value 16 is outside the valid range [0, 15]}}
+ svmls_lane_za32_u8_vg4x1(base, u8, u8, 16); // expected-error {{argument value 16 is outside the valid range [0, 15]}}
+ svmls_lane_za64_s16_vg4x1(base, s16, s16, 8); // expected-error {{argument value 8 is outside the valid range [0, 7]}}
+ svmls_lane_za64_u16_vg4x1(base, u16, u16, 8); // expected-error {{argument value 8 is outside the valid range [0, 7]}}
+
+ svmls_lane_za32_s8_vg4x2(base, s8x2, s8, 16); // expected-error {{argument value 16 is outside the valid range [0, 15]}}
+ svmls_lane_za32_u8_vg4x2(base, u8x2, u8, 16); // expected-error {{argument value 16 is outside the valid range [0, 15]}}
+ svmls_lane_za64_s16_vg4x2(base, s16x2, s16, 8); // expected-error {{argument value 8 is outside the valid range [0, 7]}}
+ svmls_lane_za64_u16_vg4x2(base, u16x2, u16, 8); // expected-error {{argument value 8 is outside the valid range [0, 7]}}
+
+ svmls_lane_za32_s8_vg4x4(base, s8x4, s8, 16); // expected-error {{argument value 16 is outside the valid range [0, 15]}}
+ svmls_lane_za32_u8_vg4x4(base, u8x4, u8, 16); // expected-error {{argument value 16 is outside the valid range [0, 15]}}
+ svmls_lane_za64_s16_vg4x4(base, s16x4, s16, 8); // expected-error {{argument value 8 is outside the valid range [0, 7]}}
+ svmls_lane_za64_u16_vg4x4(base, u16x4, u16, 8); // expected-error {{argument value 8 is outside the valid range [0, 7]}}
+
+ svsumla_lane_za32_s8_vg4x1(base, s8, u8, 16); // expected-error {{argument value 16 is outside the valid range [0, 15]}}
+ svsumla_lane_za32_s8_vg4x2(base, s8x2, u8, 16); // expected-error {{argument value 16 is outside the valid range [0, 15]}}
+ svsumla_lane_za32_s8_vg4x4(base, s8x4, u8, 16); // expected-error {{argument value 16 is outside the valid range [0, 15]}}
+
+ svusmla_lane_za32_u8_vg4x1(base, u8, s8, 16); // expected-error {{argument value 16 is outside the valid range [0, 15]}}
+ svusmla_lane_za32_u8_vg4x2(base, u8x2, s8, 16); // expected-error {{argument value 16 is outside the valid range [0, 15]}}
+ svusmla_lane_za32_u8_vg4x4(base, u8x4, s8, 16); // expected-error {{argument value 16 is outside the valid range [0, 15]}}
+}
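
The diagnostics above pin the lane immediates to [0, 15] for the 8-bit za32 forms and [0, 7] for the 16-bit za64 forms. As an illustrative counterpart (not part of the patch), the same calls compile cleanly at the top of each range, assuming the same includes and parameter declarations as the test function above:

void test_lane_in_range(uint32_t base, svint8_t s8, svint16_t s16) __arm_streaming __arm_shared_za {
  svmla_lane_za32_s8_vg4x1(base, s8, s8, 15);   // 15 is still inside [0, 15]
  svmla_lane_za64_s16_vg4x1(base, s16, s16, 7); // 7 is still inside [0, 7]
}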
>From 86ad0aad0c23e2e37180c5b4af2fac58dad30b46 Mon Sep 17 00:00:00 2001
From: Dinar Temirbulatov <Dinar.Temirbulatov at arm.com>
Date: Tue, 19 Dec 2023 23:51:52 +0000
Subject: [PATCH 7/7] Adjust SVE_ACLE_FUNC macro usage in tests, per review remark.
---
.../aarch64-sme2-intrinsics/acle_sme2_mla.c | 24 +++++------
.../aarch64-sme2-intrinsics/acle_sme2_mlal.c | 40 +++++++++----------
.../aarch64-sme2-intrinsics/acle_sme2_mlall.c | 40 +++++++++----------
.../aarch64-sme2-intrinsics/acle_sme2_mls.c | 24 +++++------
.../aarch64-sme2-intrinsics/acle_sme2_mlsl.c | 40 +++++++++----------
5 files changed, 84 insertions(+), 84 deletions(-)
diff --git a/clang/test/CodeGen/aarch64-sme2-intrinsics/acle_sme2_mla.c b/clang/test/CodeGen/aarch64-sme2-intrinsics/acle_sme2_mla.c
index b1992ae3ee9159..f52edd9888daa9 100644
--- a/clang/test/CodeGen/aarch64-sme2-intrinsics/acle_sme2_mla.c
+++ b/clang/test/CodeGen/aarch64-sme2-intrinsics/acle_sme2_mla.c
@@ -36,7 +36,7 @@
// CPP-CHECK-NEXT: ret void
//
void test_svmla2_f32(uint32_t slice_base, svfloat32x2_t zn, svfloat32x2_t zm) __arm_streaming __arm_shared_za {
- SVE_ACLE_FUNC(svmla,,_za32,_f32,_vg1x2)(slice_base, zn, zm);
+ SVE_ACLE_FUNC(svmla_za32,_f32,_vg1x2,,)(slice_base, zn, zm);
}
// CHECK-LABEL: @test_svmla4_f32(
@@ -66,7 +66,7 @@ void test_svmla2_f32(uint32_t slice_base, svfloat32x2_t zn, svfloat32x2_t zm) __
// CPP-CHECK-NEXT: ret void
//
void test_svmla4_f32(uint32_t slice_base, svfloat32x4_t zn, svfloat32x4_t zm) __arm_streaming __arm_shared_za {
- SVE_ACLE_FUNC(svmla,,_za32,_f32,_vg1x4)(slice_base, zn, zm);
+ SVE_ACLE_FUNC(svmla_za32,_f32,_vg1x4,,)(slice_base, zn, zm);
}
//
@@ -86,7 +86,7 @@ void test_svmla4_f32(uint32_t slice_base, svfloat32x4_t zn, svfloat32x4_t zm) __
// CPP-CHECK-NEXT: ret void
//
void test_svmla_single2_f32(uint32_t slice_base, svfloat32x2_t zn, svfloat32_t zm) __arm_streaming __arm_shared_za {
- SVE_ACLE_FUNC(svmla,_single,_za32,_f32,_vg1x2)(slice_base, zn, zm);
+ SVE_ACLE_FUNC(svmla_single_za32,,_f32,,_vg1x2)(slice_base, zn, zm);
}
// CHECK-LABEL: @test_svmla_single4_f32(
@@ -108,7 +108,7 @@ void test_svmla_single2_f32(uint32_t slice_base, svfloat32x2_t zn, svfloat32_t z
// CPP-CHECK-NEXT: ret void
//
void test_svmla_single4_f32(uint32_t slice_base, svfloat32x4_t zn, svfloat32_t zm) __arm_streaming __arm_shared_za {
- SVE_ACLE_FUNC(svmla,_single,_za32,_f32,_vg1x4)(slice_base, zn, zm);
+ SVE_ACLE_FUNC(svmla_single_za32,,_f32,,_vg1x4)(slice_base, zn, zm);
}
//
@@ -128,7 +128,7 @@ void test_svmla_single4_f32(uint32_t slice_base, svfloat32x4_t zn, svfloat32_t z
// CPP-CHECK-NEXT: ret void
//
void test_svmla_lane2_f32(uint32_t slice_base, svfloat32x2_t zn, svfloat32_t zm) __arm_streaming __arm_shared_za {
- SVE_ACLE_FUNC(svmla_lane,,_za32,_f32,_vg1x2)(slice_base, zn, zm, 3);
+ SVE_ACLE_FUNC(svmla_lane_za32,_f32,_vg1x2,,)(slice_base, zn, zm, 3);
}
// CHECK-LABEL: @test_svmla_lane4_f32(
@@ -150,7 +150,7 @@ void test_svmla_lane2_f32(uint32_t slice_base, svfloat32x2_t zn, svfloat32_t zm)
// CPP-CHECK-NEXT: ret void
//
void test_svmla_lane4_f32(uint32_t slice_base, svfloat32x4_t zn, svfloat32_t zm) __arm_streaming __arm_shared_za {
- SVE_ACLE_FUNC(svmla_lane,,_za32,_f32,_vg1x4)(slice_base, zn, zm, 3);
+ SVE_ACLE_FUNC(svmla_lane_za32,_f32,_vg1x4,,)(slice_base, zn, zm, 3);
}
//
@@ -174,7 +174,7 @@ void test_svmla_lane4_f32(uint32_t slice_base, svfloat32x4_t zn, svfloat32_t zm)
// CPP-CHECK-NEXT: ret void
//
void test_svmla2_f64(uint32_t slice_base, svfloat64x2_t zn, svfloat64x2_t zm) __arm_streaming __arm_shared_za {
- SVE_ACLE_FUNC(svmla,,_za64,_f64,_vg1x2)(slice_base, zn, zm);
+ SVE_ACLE_FUNC(svmla_za64,_f64,_vg1x2,,)(slice_base, zn, zm);
}
// CHECK-LABEL: @test_svmla4_f64(
@@ -204,7 +204,7 @@ void test_svmla2_f64(uint32_t slice_base, svfloat64x2_t zn, svfloat64x2_t zm) __
// CPP-CHECK-NEXT: ret void
//
void test_svmla4_f64(uint32_t slice_base, svfloat64x4_t zn, svfloat64x4_t zm) __arm_streaming __arm_shared_za {
- SVE_ACLE_FUNC(svmla,,_za64,_f64,_vg1x4)(slice_base, zn, zm);
+ SVE_ACLE_FUNC(svmla_za64,_f64,_vg1x4,,)(slice_base, zn, zm);
}
//
@@ -224,7 +224,7 @@ void test_svmla4_f64(uint32_t slice_base, svfloat64x4_t zn, svfloat64x4_t zm) __
// CPP-CHECK-NEXT: ret void
//
void test_svmla_single2_f64(uint32_t slice_base, svfloat64x2_t zn, svfloat64_t zm) __arm_streaming __arm_shared_za {
- SVE_ACLE_FUNC(svmla,_single,_za64,_f64,_vg1x2)(slice_base, zn, zm);
+ SVE_ACLE_FUNC(svmla_single_za64,,_f64,,_vg1x2)(slice_base, zn, zm);
}
// CHECK-LABEL: @test_svmla_single4_f64(
@@ -246,7 +246,7 @@ void test_svmla_single2_f64(uint32_t slice_base, svfloat64x2_t zn, svfloat64_t z
// CPP-CHECK-NEXT: ret void
//
void test_svmla_single4_f64(uint32_t slice_base, svfloat64x4_t zn, svfloat64_t zm) __arm_streaming __arm_shared_za {
- SVE_ACLE_FUNC(svmla,_single,_za64,_f64,_vg1x4)(slice_base, zn, zm);
+ SVE_ACLE_FUNC(svmla_single_za64,,_f64,,_vg1x4)(slice_base, zn, zm);
}
//
@@ -266,7 +266,7 @@ void test_svmla_single4_f64(uint32_t slice_base, svfloat64x4_t zn, svfloat64_t z
// CPP-CHECK-NEXT: ret void
//
void test_svmla_lane2_f64(uint32_t slice_base, svfloat64x2_t zn, svfloat64_t zm) __arm_streaming __arm_shared_za {
- SVE_ACLE_FUNC(svmla_lane,,_za64,_f64,_vg1x2)(slice_base, zn, zm, 1);
+ SVE_ACLE_FUNC(svmla_lane_za64,_f64,_vg1x2,,)(slice_base, zn, zm, 1);
}
// CHECK-LABEL: @test_svmla_lane4_f64(
@@ -288,5 +288,5 @@ void test_svmla_lane2_f64(uint32_t slice_base, svfloat64x2_t zn, svfloat64_t zm)
// CPP-CHECK-NEXT: ret void
//
void test_svmla_lane4_f64(uint32_t slice_base, svfloat64x4_t zn, svfloat64_t zm) __arm_streaming __arm_shared_za {
- SVE_ACLE_FUNC(svmla_lane,,_za64,_f64,_vg1x4)(slice_base, zn, zm, 1);
+ SVE_ACLE_FUNC(svmla_lane_za64,_f64,_vg1x4,,)(slice_base, zn, zm, 1);
}
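
The lane immediates used in this file (3 for f32, 1 for f64) follow from lane selection addressing one 128-bit segment of the vector, so the maximum index is 128/eltbits - 1. That is an assumed rule of thumb, stated here only because it matches the immediates above:

/* Sketch of the assumed lane-count rule; compiles as C11. */
_Static_assert(128 / 32 - 1 == 3, "f32 lane index runs 0..3");
_Static_assert(128 / 64 - 1 == 1, "f64 lane index runs 0..1");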
diff --git a/clang/test/CodeGen/aarch64-sme2-intrinsics/acle_sme2_mlal.c b/clang/test/CodeGen/aarch64-sme2-intrinsics/acle_sme2_mlal.c
index c216e4cce944fd..834ade75350767 100644
--- a/clang/test/CodeGen/aarch64-sme2-intrinsics/acle_sme2_mlal.c
+++ b/clang/test/CodeGen/aarch64-sme2-intrinsics/acle_sme2_mlal.c
@@ -311,7 +311,7 @@ void test_svmla_single1_s16(uint32_t slice_base, svint16_t zn, svint16_t zm) __a
//
void test_svmla_single2_f16(uint32_t slice_base, svfloat16x2_t zn, svfloat16_t zm) __arm_streaming __arm_shared_za
{
- SVE_ACLE_FUNC(svmla,_single,_za32,_f16,_vg2x2)(slice_base, zn, zm);
+ SVE_ACLE_FUNC(svmla_single_za32,,_f16,,_vg2x2)(slice_base, zn, zm);
}
// CHECK-LABEL: @test_svmla_single2_bf16(
@@ -330,7 +330,7 @@ void test_svmla_single2_f16(uint32_t slice_base, svfloat16x2_t zn, svfloat16_t z
//
void test_svmla_single2_bf16(uint32_t slice_base, svbfloat16x2_t zn, svbfloat16_t zm) __arm_streaming __arm_shared_za
{
- SVE_ACLE_FUNC(svmla,_single,_za32,_bf16,_vg2x2)(slice_base, zn, zm);
+ SVE_ACLE_FUNC(svmla_single_za32,,_bf16,,_vg2x2)(slice_base, zn, zm);
}
// CHECK-LABEL: @test_svmla_single2_u16(
@@ -349,7 +349,7 @@ void test_svmla_single2_bf16(uint32_t slice_base, svbfloat16x2_t zn, svbfloat16_
//
void test_svmla_single2_u16(uint32_t slice_base, svuint16x2_t zn, svuint16_t zm) __arm_streaming __arm_shared_za
{
- SVE_ACLE_FUNC(svmla,_single,_za32,_u16,_vg2x2)(slice_base, zn, zm);
+ SVE_ACLE_FUNC(svmla_single_za32,,_u16,,_vg2x2)(slice_base, zn, zm);
}
// CHECK-LABEL: @test_svmla_single2_s16(
@@ -368,7 +368,7 @@ void test_svmla_single2_u16(uint32_t slice_base, svuint16x2_t zn, svuint16_t zm)
//
void test_svmla_single2_s16(uint32_t slice_base, svint16x2_t zn, svint16_t zm) __arm_streaming __arm_shared_za
{
- SVE_ACLE_FUNC(svmla,_single,_za32,_s16,_vg2x2)(slice_base, zn, zm);
+ SVE_ACLE_FUNC(svmla_single_za32,,_s16,,_vg2x2)(slice_base, zn, zm);
}
// CHECK-LABEL: @test_svmla_single4_f16(
@@ -391,7 +391,7 @@ void test_svmla_single2_s16(uint32_t slice_base, svint16x2_t zn, svint16_t zm) _
//
void test_svmla_single4_f16(uint32_t slice_base, svfloat16x4_t zn, svfloat16_t zm) __arm_streaming __arm_shared_za
{
- SVE_ACLE_FUNC(svmla,_single,_za32,_f16,_vg2x4)(slice_base, zn, zm);
+ SVE_ACLE_FUNC(svmla_single_za32,,_f16,,_vg2x4)(slice_base, zn, zm);
}
// CHECK-LABEL: @test_svmla_single4_bf16(
@@ -414,7 +414,7 @@ void test_svmla_single4_f16(uint32_t slice_base, svfloat16x4_t zn, svfloat16_t z
//
void test_svmla_single4_bf16(uint32_t slice_base, svbfloat16x4_t zn, svbfloat16_t zm) __arm_streaming __arm_shared_za
{
- SVE_ACLE_FUNC(svmla,_single,_za32,_bf16,_vg2x4)(slice_base, zn, zm);
+ SVE_ACLE_FUNC(svmla_single_za32,,_bf16,,_vg2x4)(slice_base, zn, zm);
}
// CHECK-LABEL: @test_svmla_single4_u16(
@@ -437,7 +437,7 @@ void test_svmla_single4_bf16(uint32_t slice_base, svbfloat16x4_t zn, svbfloat16_
//
void test_svmla_single4_u16(uint32_t slice_base, svuint16x4_t zn, svuint16_t zm) __arm_streaming __arm_shared_za
{
- SVE_ACLE_FUNC(svmla,_single,_za32,_u16,_vg2x4)(slice_base, zn, zm);
+ SVE_ACLE_FUNC(svmla_single_za32,,_u16,,_vg2x4)(slice_base, zn, zm);
}
// CHECK-LABEL: @test_svmla_single4_s16(
@@ -460,7 +460,7 @@ void test_svmla_single4_u16(uint32_t slice_base, svuint16x4_t zn, svuint16_t zm)
//
void test_svmla_single4_s16(uint32_t slice_base, svint16x4_t zn, svint16_t zm) __arm_streaming __arm_shared_za
{
- SVE_ACLE_FUNC(svmla,_single,_za32,_s16,_vg2x4)(slice_base, zn, zm);
+ SVE_ACLE_FUNC(svmla_single_za32,,_s16,,_vg2x4)(slice_base, zn, zm);
}
//
@@ -479,7 +479,7 @@ void test_svmla_single4_s16(uint32_t slice_base, svint16x4_t zn, svint16_t zm) _
//
void test_svmla_lane1_f16(uint32_t slice_base, svfloat16_t zn, svfloat16_t zm) __arm_streaming __arm_shared_za
{
- SVE_ACLE_FUNC(svmla_lane,,_za32,_f16,_vg2x1)(slice_base, zn, zm, 7);
+ SVE_ACLE_FUNC(svmla_lane_za32,,,_f16,_vg2x1)(slice_base, zn, zm, 7);
}
// CHECK-LABEL: @test_svmla_lane1_bf16(
@@ -494,7 +494,7 @@ void test_svmla_lane1_f16(uint32_t slice_base, svfloat16_t zn, svfloat16_t zm) _
//
void test_svmla_lane1_bf16(uint32_t slice_base, svbfloat16_t zn, svbfloat16_t zm) __arm_streaming __arm_shared_za
{
- SVE_ACLE_FUNC(svmla_lane,,_za32,_bf16,_vg2x1)(slice_base, zn, zm, 7);
+ SVE_ACLE_FUNC(svmla_lane_za32,,,_bf16,_vg2x1)(slice_base, zn, zm, 7);
}
// CHECK-LABEL: @test_svmla_lane1_u16(
@@ -509,7 +509,7 @@ void test_svmla_lane1_bf16(uint32_t slice_base, svbfloat16_t zn, svbfloat16_t zm
//
void test_svmla_lane1_u16(uint32_t slice_base, svuint16_t zn, svuint16_t zm) __arm_streaming __arm_shared_za
{
- SVE_ACLE_FUNC(svmla_lane,,_za32,_u16,_vg2x1)(slice_base, zn, zm, 7);
+ SVE_ACLE_FUNC(svmla_lane_za32,,,_u16,_vg2x1)(slice_base, zn, zm, 7);
}
// CHECK-LABEL: @test_svmla_lane1_s16(
@@ -524,7 +524,7 @@ void test_svmla_lane1_u16(uint32_t slice_base, svuint16_t zn, svuint16_t zm) __a
//
void test_svmla_lane1_s16(uint32_t slice_base, svint16_t zn, svint16_t zm) __arm_streaming __arm_shared_za
{
- SVE_ACLE_FUNC(svmla_lane,,_za32,_s16,_vg2x1)(slice_base, zn, zm, 7);
+ SVE_ACLE_FUNC(svmla_lane_za32,,,_s16,_vg2x1)(slice_base, zn, zm, 7);
}
// CHECK-LABEL: @test_svmla_lane2_f16(
@@ -543,7 +543,7 @@ void test_svmla_lane1_s16(uint32_t slice_base, svint16_t zn, svint16_t zm) __arm
//
void test_svmla_lane2_f16(uint32_t slice_base, svfloat16x2_t zn, svfloat16_t zm) __arm_streaming __arm_shared_za
{
- SVE_ACLE_FUNC(svmla_lane,,_za32,_f16,_vg2x2)(slice_base, zn, zm, 7);
+ SVE_ACLE_FUNC(svmla_lane_za32,,,_f16,_vg2x2)(slice_base, zn, zm, 7);
}
// CHECK-LABEL: @test_svmla_lane2_bf16(
@@ -562,7 +562,7 @@ void test_svmla_lane2_f16(uint32_t slice_base, svfloat16x2_t zn, svfloat16_t zm)
//
void test_svmla_lane2_bf16(uint32_t slice_base, svbfloat16x2_t zn, svbfloat16_t zm) __arm_streaming __arm_shared_za
{
- SVE_ACLE_FUNC(svmla_lane,,_za32,_bf16,_vg2x2)(slice_base, zn, zm, 7);
+ SVE_ACLE_FUNC(svmla_lane_za32,,,_bf16,_vg2x2)(slice_base, zn, zm, 7);
}
// CHECK-LABEL: @test_svmla_lane2_u16(
@@ -581,7 +581,7 @@ void test_svmla_lane2_bf16(uint32_t slice_base, svbfloat16x2_t zn, svbfloat16_t
//
void test_svmla_lane2_u16(uint32_t slice_base, svuint16x2_t zn, svuint16_t zm) __arm_streaming __arm_shared_za
{
- SVE_ACLE_FUNC(svmla_lane,,_za32,_u16,_vg2x2)(slice_base, zn, zm, 7);
+ SVE_ACLE_FUNC(svmla_lane_za32,,,_u16,_vg2x2)(slice_base, zn, zm, 7);
}
// CHECK-LABEL: @test_svmla_lane2_s16(
@@ -600,7 +600,7 @@ void test_svmla_lane2_u16(uint32_t slice_base, svuint16x2_t zn, svuint16_t zm) _
//
void test_svmla_lane2_s16(uint32_t slice_base, svint16x2_t zn, svint16_t zm) __arm_streaming __arm_shared_za
{
- SVE_ACLE_FUNC(svmla_lane,,_za32,_s16,_vg2x2)(slice_base, zn, zm, 7);
+ SVE_ACLE_FUNC(svmla_lane_za32,,,_s16,_vg2x2)(slice_base, zn, zm, 7);
}
// CHECK-LABEL: @test_svmla_lane4_f16(
@@ -623,7 +623,7 @@ void test_svmla_lane2_s16(uint32_t slice_base, svint16x2_t zn, svint16_t zm) __a
//
void test_svmla_lane4_f16(uint32_t slice_base, svfloat16x4_t zn, svfloat16_t zm) __arm_streaming __arm_shared_za
{
- SVE_ACLE_FUNC(svmla_lane,,_za32,_f16,_vg2x4)(slice_base, zn, zm, 7);
+ SVE_ACLE_FUNC(svmla_lane_za32,,,_f16,_vg2x4)(slice_base, zn, zm, 7);
}
// CHECK-LABEL: @test_svmla_lane4_bf16(
@@ -646,7 +646,7 @@ void test_svmla_lane4_f16(uint32_t slice_base, svfloat16x4_t zn, svfloat16_t zm)
//
void test_svmla_lane4_bf16(uint32_t slice_base, svbfloat16x4_t zn, svbfloat16_t zm) __arm_streaming __arm_shared_za
{
- SVE_ACLE_FUNC(svmla_lane,,_za32,_bf16,_vg2x4)(slice_base, zn, zm, 7);
+ SVE_ACLE_FUNC(svmla_lane_za32,,,_bf16,_vg2x4)(slice_base, zn, zm, 7);
}
// CHECK-LABEL: @test_svmla_lane4_u16(
@@ -669,7 +669,7 @@ void test_svmla_lane4_bf16(uint32_t slice_base, svbfloat16x4_t zn, svbfloat16_t
//
void test_svmla_lane4_u16(uint32_t slice_base, svuint16x4_t zn, svuint16_t zm) __arm_streaming __arm_shared_za
{
- SVE_ACLE_FUNC(svmla_lane,,_za32,_u16,_vg2x4)(slice_base, zn, zm, 7);
+ SVE_ACLE_FUNC(svmla_lane_za32,,,_u16,_vg2x4)(slice_base, zn, zm, 7);
}
// CHECK-LABEL: @test_svmla_lane4_s16(
@@ -692,5 +692,5 @@ void test_svmla_lane4_u16(uint32_t slice_base, svuint16x4_t zn, svuint16_t zm) _
//
void test_svmla_lane4_s16(uint32_t slice_base, svint16x4_t zn, svint16_t zm) __arm_streaming __arm_shared_za
{
- SVE_ACLE_FUNC(svmla_lane,,_za32,_s16,_vg2x4)(slice_base, zn, zm, 7);
+ SVE_ACLE_FUNC(svmla_lane_za32,,,_s16,_vg2x4)(slice_base, zn, zm, 7);
}
diff --git a/clang/test/CodeGen/aarch64-sme2-intrinsics/acle_sme2_mlall.c b/clang/test/CodeGen/aarch64-sme2-intrinsics/acle_sme2_mlall.c
index 44e715e9581b7c..fceb16a482600d 100644
--- a/clang/test/CodeGen/aarch64-sme2-intrinsics/acle_sme2_mlall.c
+++ b/clang/test/CodeGen/aarch64-sme2-intrinsics/acle_sme2_mlall.c
@@ -199,7 +199,7 @@ void test_usmlall_single_x1_u8(uint32_t slice_base, svuint8_t zn, svint8_t zm) _
//
void test_svmla_single_x2_s8(uint32_t slice_base, svint8x2_t zn, svint8_t zm) __arm_streaming __arm_shared_za
{
- SVE_ACLE_FUNC(svmla,_single,_za32,_s8,_vg4x2)(slice_base, zn, zm);
+ SVE_ACLE_FUNC(svmla_single_za32,,_s8,,_vg4x2)(slice_base, zn, zm);
}
// CHECK-LABEL: @test_svmla_single_x2_s16(
@@ -218,7 +218,7 @@ void test_svmla_single_x2_s8(uint32_t slice_base, svint8x2_t zn, svint8_t zm) __
//
void test_svmla_single_x2_s16(uint32_t slice_base, svint16x2_t zn, svint16_t zm) __arm_streaming __arm_shared_za
{
- SVE_ACLE_FUNC(svmla,_single,_za64,_s16,_vg4x2)(slice_base, zn, zm);
+ SVE_ACLE_FUNC(svmla_single_za64,,_s16,,_vg4x2)(slice_base, zn, zm);
}
// CHECK-LABEL: @test_svmla_single_x2_u8(
@@ -237,7 +237,7 @@ void test_svmla_single_x2_s16(uint32_t slice_base, svint16x2_t zn, svint16_t zm)
//
void test_svmla_single_x2_u8(uint32_t slice_base, svuint8x2_t zn, svuint8_t zm) __arm_streaming __arm_shared_za
{
- SVE_ACLE_FUNC(svmla,_single,_za32,_u8,_vg4x2)(slice_base, zn, zm);
+ SVE_ACLE_FUNC(svmla_single_za32,,_u8,,_vg4x2)(slice_base, zn, zm);
}
// CHECK-LABEL: @test_svmla_single_x2_u16(
@@ -256,7 +256,7 @@ void test_svmla_single_x2_u8(uint32_t slice_base, svuint8x2_t zn, svuint8_t zm)
//
void test_svmla_single_x2_u16(uint32_t slice_base, svuint16x2_t zn, svuint16_t zm) __arm_streaming __arm_shared_za
{
- SVE_ACLE_FUNC(svmla,_single,_za64,_u16,_vg4x2)(slice_base, zn, zm);
+ SVE_ACLE_FUNC(svmla_single_za64,,_u16,,_vg4x2)(slice_base, zn, zm);
}
// MLSL
@@ -277,7 +277,7 @@ void test_svmla_single_x2_u16(uint32_t slice_base, svuint16x2_t zn, svuint16_t z
//
void test_svmls_single_x2_s8(uint32_t slice_base, svint8x2_t zn, svint8_t zm) __arm_streaming __arm_shared_za
{
- SVE_ACLE_FUNC(svmls,_single,_za32,_s8,_vg4x2)(slice_base, zn, zm);
+ SVE_ACLE_FUNC(svmls_single_za32,,_s8,,_vg4x2)(slice_base, zn, zm);
}
// CHECK-LABEL: @test_svmls_single_x2_s16(
@@ -296,7 +296,7 @@ void test_svmls_single_x2_s8(uint32_t slice_base, svint8x2_t zn, svint8_t zm) __
//
void test_svmls_single_x2_s16(uint32_t slice_base, svint16x2_t zn, svint16_t zm) __arm_streaming __arm_shared_za
{
- SVE_ACLE_FUNC(svmls,_single,_za64,_s16,_vg4x2)(slice_base, zn, zm);
+ SVE_ACLE_FUNC(svmls_single_za64,,_s16,,_vg4x2)(slice_base, zn, zm);
}
// CHECK-LABEL: @test_svmls_single_x2_u8(
@@ -315,7 +315,7 @@ void test_svmls_single_x2_s16(uint32_t slice_base, svint16x2_t zn, svint16_t zm)
//
void test_svmls_single_x2_u8(uint32_t slice_base, svuint8x2_t zn, svuint8_t zm) __arm_streaming __arm_shared_za
{
- SVE_ACLE_FUNC(svmls,_single,_za32,_u8,_vg4x2)(slice_base, zn, zm);
+ SVE_ACLE_FUNC(svmls_single_za32,,_u8,,_vg4x2)(slice_base, zn, zm);
}
// CHECK-LABEL: @test_svmls_single_x2_u16(
@@ -334,7 +334,7 @@ void test_svmls_single_x2_u8(uint32_t slice_base, svuint8x2_t zn, svuint8_t zm)
//
void test_svmls_single_x2_u16(uint32_t slice_base, svuint16x2_t zn, svuint16_t zm) __arm_streaming __arm_shared_za
{
- SVE_ACLE_FUNC(svmls,_single,_za64,_u16,_vg4x2)(slice_base, zn, zm);
+ SVE_ACLE_FUNC(svmls_single_za64,,_u16,,_vg4x2)(slice_base, zn, zm);
}
// SUMLALL
@@ -355,7 +355,7 @@ void test_svmls_single_x2_u16(uint32_t slice_base, svuint16x2_t zn, svuint16_t z
//
void test_svsumla_single_x2_s8(uint32_t slice_base, svint8x2_t zn, svuint8_t zm) __arm_streaming __arm_shared_za
{
- SVE_ACLE_FUNC(svsumla,_single,_za32,_s8,_vg4x2)(slice_base, zn, zm);
+ SVE_ACLE_FUNC(svsumla_single_za32,,_s8,,_vg4x2)(slice_base, zn, zm);
}
// USMLALL
@@ -376,7 +376,7 @@ void test_svsumla_single_x2_s8(uint32_t slice_base, svint8x2_t zn, svuint8_t zm)
//
void test_usmlall_single_x2_u8(uint32_t slice_base, svuint8x2_t zn, svint8_t zm) __arm_streaming __arm_shared_za
{
- SVE_ACLE_FUNC(svusmla,_single,_za32,_u8,_vg4x2)(slice_base, zn, zm);
+ SVE_ACLE_FUNC(svusmla_single_za32,,_u8,,_vg4x2)(slice_base, zn, zm);
}
//
@@ -405,7 +405,7 @@ void test_usmlall_single_x2_u8(uint32_t slice_base, svuint8x2_t zn, svint8_t zm)
//
void test_svmla_single_x4_s8(uint32_t slice_base, svint8x4_t zn, svint8_t zm) __arm_streaming __arm_shared_za
{
- SVE_ACLE_FUNC(svmla,_single,_za32,_s8,_vg4x4)(slice_base, zn, zm);
+ SVE_ACLE_FUNC(svmla_single_za32,,_s8,,_vg4x4)(slice_base, zn, zm);
}
// CHECK-LABEL: @test_svmla_single_x4_s16(
@@ -428,7 +428,7 @@ void test_svmla_single_x4_s8(uint32_t slice_base, svint8x4_t zn, svint8_t zm) __
//
void test_svmla_single_x4_s16(uint32_t slice_base, svint16x4_t zn, svint16_t zm) __arm_streaming __arm_shared_za
{
- SVE_ACLE_FUNC(svmla,_single,_za64,_s16,_vg4x4)(slice_base, zn, zm);
+ SVE_ACLE_FUNC(svmla_single_za64,,_s16,,_vg4x4)(slice_base, zn, zm);
}
// CHECK-LABEL: @test_svmla_single_x4_u8(
@@ -451,7 +451,7 @@ void test_svmla_single_x4_s16(uint32_t slice_base, svint16x4_t zn, svint16_t zm)
//
void test_svmla_single_x4_u8(uint32_t slice_base, svuint8x4_t zn, svuint8_t zm) __arm_streaming __arm_shared_za
{
- SVE_ACLE_FUNC(svmla,_single,_za32,_u8,_vg4x4)(slice_base, zn, zm);
+ SVE_ACLE_FUNC(svmla_single_za32,,_u8,,_vg4x4)(slice_base, zn, zm);
}
// CHECK-LABEL: @test_svmla_single_x4_u16(
@@ -474,7 +474,7 @@ void test_svmla_single_x4_u8(uint32_t slice_base, svuint8x4_t zn, svuint8_t zm)
//
void test_svmla_single_x4_u16(uint32_t slice_base, svuint16x4_t zn, svuint16_t zm) __arm_streaming __arm_shared_za
{
- SVE_ACLE_FUNC(svmla,_single,_za64,_u16,_vg4x4)(slice_base, zn, zm);
+ SVE_ACLE_FUNC(svmla_single_za64,,_u16,,_vg4x4)(slice_base, zn, zm);
}
// MLSL
@@ -499,7 +499,7 @@ void test_svmla_single_x4_u16(uint32_t slice_base, svuint16x4_t zn, svuint16_t z
//
void test_svmls_single_x4_s8(uint32_t slice_base, svint8x4_t zn, svint8_t zm) __arm_streaming __arm_shared_za
{
- SVE_ACLE_FUNC(svmls,_single,_za32,_s8,_vg4x4)(slice_base, zn, zm);
+ SVE_ACLE_FUNC(svmls_single_za32,,_s8,,_vg4x4)(slice_base, zn, zm);
}
// CHECK-LABEL: @test_svmls_single_x4_s16(
@@ -522,7 +522,7 @@ void test_svmls_single_x4_s8(uint32_t slice_base, svint8x4_t zn, svint8_t zm) __
//
void test_svmls_single_x4_s16(uint32_t slice_base, svint16x4_t zn, svint16_t zm) __arm_streaming __arm_shared_za
{
- SVE_ACLE_FUNC(svmls,_single,_za64,_s16,_vg4x4)(slice_base, zn, zm);
+ SVE_ACLE_FUNC(svmls_single_za64,,_s16,,_vg4x4)(slice_base, zn, zm);
}
// CHECK-LABEL: @test_svmls_single_x4_u8(
@@ -545,7 +545,7 @@ void test_svmls_single_x4_s16(uint32_t slice_base, svint16x4_t zn, svint16_t zm)
//
void test_svmls_single_x4_u8(uint32_t slice_base, svuint8x4_t zn, svuint8_t zm) __arm_streaming __arm_shared_za
{
- SVE_ACLE_FUNC(svmls,_single,_za32,_u8,_vg4x4)(slice_base, zn, zm);
+ SVE_ACLE_FUNC(svmls_single_za32,,_u8,,_vg4x4)(slice_base, zn, zm);
}
// CHECK-LABEL: @test_svmls_single_x4_u16(
@@ -568,7 +568,7 @@ void test_svmls_single_x4_u8(uint32_t slice_base, svuint8x4_t zn, svuint8_t zm)
//
void test_svmls_single_x4_u16(uint32_t slice_base, svuint16x4_t zn, svuint16_t zm) __arm_streaming __arm_shared_za
{
- SVE_ACLE_FUNC(svmls,_single,_za64,_u16,_vg4x4)(slice_base, zn, zm);
+ SVE_ACLE_FUNC(svmls_single_za64,,_u16,,_vg4x4)(slice_base, zn, zm);
}
// SUMLALL
@@ -593,7 +593,7 @@ void test_svmls_single_x4_u16(uint32_t slice_base, svuint16x4_t zn, svuint16_t z
//
void test_svsumla_single_x4_s8(uint32_t slice_base, svint8x4_t zn, svuint8_t zm) __arm_streaming __arm_shared_za
{
- SVE_ACLE_FUNC(svsumla,_single,_za32,_s8,_vg4x4)(slice_base, zn, zm);
+ SVE_ACLE_FUNC(svsumla_single_za32,,_s8,,_vg4x4)(slice_base, zn, zm);
}
// USMLALL
@@ -618,7 +618,7 @@ void test_svsumla_single_x4_s8(uint32_t slice_base, svint8x4_t zn, svuint8_t zm)
//
void test_usmlall_single_x4_u8(uint32_t slice_base, svuint8x4_t zn, svint8_t zm) __arm_streaming __arm_shared_za
{
- SVE_ACLE_FUNC(svusmla,_single,_za32,_u8,_vg4x4)(slice_base, zn, zm);
+ SVE_ACLE_FUNC(svusmla_single_za32,,_u8,,_vg4x4)(slice_base, zn, zm);
}
//
diff --git a/clang/test/CodeGen/aarch64-sme2-intrinsics/acle_sme2_mls.c b/clang/test/CodeGen/aarch64-sme2-intrinsics/acle_sme2_mls.c
index 6e48c330bbedfa..6830a399e91d64 100644
--- a/clang/test/CodeGen/aarch64-sme2-intrinsics/acle_sme2_mls.c
+++ b/clang/test/CodeGen/aarch64-sme2-intrinsics/acle_sme2_mls.c
@@ -36,7 +36,7 @@
// CPP-CHECK-NEXT: ret void
//
void test_svmls2_f32(uint32_t slice_base, svfloat32x2_t zn, svfloat32x2_t zm) __arm_streaming __arm_shared_za {
- SVE_ACLE_FUNC(svmls,,_za32,_f32,_vg1x2)(slice_base, zn, zm);
+ SVE_ACLE_FUNC(svmls_za32,,_f32,,_vg1x2)(slice_base, zn, zm);
}
// CHECK-LABEL: @test_svmls4_f32(
@@ -66,7 +66,7 @@ void test_svmls2_f32(uint32_t slice_base, svfloat32x2_t zn, svfloat32x2_t zm) __
// CPP-CHECK-NEXT: ret void
//
void test_svmls4_f32(uint32_t slice_base, svfloat32x4_t zn, svfloat32x4_t zm) __arm_streaming __arm_shared_za {
- SVE_ACLE_FUNC(svmls,,_za32,_f32,_vg1x4)(slice_base, zn, zm);
+ SVE_ACLE_FUNC(svmls_za32,,_f32,,_vg1x4)(slice_base, zn, zm);
}
//
@@ -86,7 +86,7 @@ void test_svmls4_f32(uint32_t slice_base, svfloat32x4_t zn, svfloat32x4_t zm) __
// CPP-CHECK-NEXT: ret void
//
void test_svmls_single2_f32(uint32_t slice_base, svfloat32x2_t zn, svfloat32_t zm) __arm_streaming __arm_shared_za {
- SVE_ACLE_FUNC(svmls,_single,_za32,_f32,_vg1x2)(slice_base, zn, zm);
+ SVE_ACLE_FUNC(svmls_single_za32,,_f32,,_vg1x2)(slice_base, zn, zm);
}
// CHECK-LABEL: @test_svmls_single4_f32(
@@ -108,7 +108,7 @@ void test_svmls_single2_f32(uint32_t slice_base, svfloat32x2_t zn, svfloat32_t z
// CPP-CHECK-NEXT: ret void
//
void test_svmls_single4_f32(uint32_t slice_base, svfloat32x4_t zn, svfloat32_t zm) __arm_streaming __arm_shared_za {
- SVE_ACLE_FUNC(svmls,_single,_za32,_f32,_vg1x4)(slice_base, zn, zm);
+ SVE_ACLE_FUNC(svmls_single_za32,,_f32,,_vg1x4)(slice_base, zn, zm);
}
//
@@ -128,7 +128,7 @@ void test_svmls_single4_f32(uint32_t slice_base, svfloat32x4_t zn, svfloat32_t z
// CPP-CHECK-NEXT: ret void
//
void test_svmls_lane2_f32(uint32_t slice_base, svfloat32x2_t zn, svfloat32_t zm) __arm_streaming __arm_shared_za {
- SVE_ACLE_FUNC(svmls_lane,,_za32,_f32,_vg1x2)(slice_base, zn, zm, 3);
+ SVE_ACLE_FUNC(svmls_lane_za32,,_f32,,_vg1x2)(slice_base, zn, zm, 3);
}
// CHECK-LABEL: @test_svmls_lane4_f32(
@@ -150,7 +150,7 @@ void test_svmls_lane2_f32(uint32_t slice_base, svfloat32x2_t zn, svfloat32_t zm)
// CPP-CHECK-NEXT: ret void
//
void test_svmls_lane4_f32(uint32_t slice_base, svfloat32x4_t zn, svfloat32_t zm) __arm_streaming __arm_shared_za {
- SVE_ACLE_FUNC(svmls_lane,,_za32,_f32,_vg1x4)(slice_base, zn, zm, 3);
+ SVE_ACLE_FUNC(svmls_lane_za32,,_f32,,_vg1x4)(slice_base, zn, zm, 3);
}
//
@@ -174,7 +174,7 @@ void test_svmls_lane4_f32(uint32_t slice_base, svfloat32x4_t zn, svfloat32_t zm)
// CPP-CHECK-NEXT: ret void
//
void test_svmls2_f64(uint32_t slice_base, svfloat64x2_t zn, svfloat64x2_t zm) __arm_streaming __arm_shared_za {
- SVE_ACLE_FUNC(svmls,,_za64,_f64,_vg1x2)(slice_base, zn, zm);
+ SVE_ACLE_FUNC(svmls_za64,,_f64,,_vg1x2)(slice_base, zn, zm);
}
// CHECK-LABEL: @test_svmls4_f64(
@@ -204,7 +204,7 @@ void test_svmls2_f64(uint32_t slice_base, svfloat64x2_t zn, svfloat64x2_t zm) __
// CPP-CHECK-NEXT: ret void
//
void test_svmls4_f64(uint32_t slice_base, svfloat64x4_t zn, svfloat64x4_t zm) __arm_streaming __arm_shared_za {
- SVE_ACLE_FUNC(svmls,,_za64,_f64,_vg1x4)(slice_base, zn, zm);
+ SVE_ACLE_FUNC(svmls_za64,,_f64,,_vg1x4)(slice_base, zn, zm);
}
//
@@ -224,7 +224,7 @@ void test_svmls4_f64(uint32_t slice_base, svfloat64x4_t zn, svfloat64x4_t zm) __
// CPP-CHECK-NEXT: ret void
//
void test_svmls_single2_f64(uint32_t slice_base, svfloat64x2_t zn, svfloat64_t zm) __arm_streaming __arm_shared_za {
- SVE_ACLE_FUNC(svmls,_single,_za64,_f64,_vg1x2)(slice_base, zn, zm);
+ SVE_ACLE_FUNC(svmls_single_za64,,_f64,,_vg1x2)(slice_base, zn, zm);
}
// CHECK-LABEL: @test_svmls_single4_f64(
@@ -246,7 +246,7 @@ void test_svmls_single2_f64(uint32_t slice_base, svfloat64x2_t zn, svfloat64_t z
// CPP-CHECK-NEXT: ret void
//
void test_svmls_single4_f64(uint32_t slice_base, svfloat64x4_t zn, svfloat64_t zm) __arm_streaming __arm_shared_za {
- SVE_ACLE_FUNC(svmls,_single,_za64,_f64,_vg1x4)(slice_base, zn, zm);
+ SVE_ACLE_FUNC(svmls_single_za64,,_f64,,_vg1x4)(slice_base, zn, zm);
}
//
@@ -266,7 +266,7 @@ void test_svmls_single4_f64(uint32_t slice_base, svfloat64x4_t zn, svfloat64_t z
// CPP-CHECK-NEXT: ret void
//
void test_svmls_lane2_f64(uint32_t slice_base, svfloat64x2_t zn, svfloat64_t zm) __arm_streaming __arm_shared_za {
- SVE_ACLE_FUNC(svmls_lane,,_za64,_f64,_vg1x2)(slice_base, zn, zm, 1);
+ SVE_ACLE_FUNC(svmls_lane_za64,,_f64,,_vg1x2)(slice_base, zn, zm, 1);
}
// CHECK-LABEL: @test_svmls_lane4_f64(
@@ -288,5 +288,5 @@ void test_svmls_lane2_f64(uint32_t slice_base, svfloat64x2_t zn, svfloat64_t zm)
// CPP-CHECK-NEXT: ret void
//
void test_svmls_lane4_f64(uint32_t slice_base, svfloat64x4_t zn, svfloat64_t zm) __arm_streaming __arm_shared_za {
- SVE_ACLE_FUNC(svmls_lane,,_za64,_f64,_vg1x4)(slice_base, zn, zm, 1);
+ SVE_ACLE_FUNC(svmls_lane_za64,,_f64,,_vg1x4)(slice_base, zn, zm, 1);
}
diff --git a/clang/test/CodeGen/aarch64-sme2-intrinsics/acle_sme2_mlsl.c b/clang/test/CodeGen/aarch64-sme2-intrinsics/acle_sme2_mlsl.c
index 1368ffe52c1ea5..0a87d97f649c90 100644
--- a/clang/test/CodeGen/aarch64-sme2-intrinsics/acle_sme2_mlsl.c
+++ b/clang/test/CodeGen/aarch64-sme2-intrinsics/acle_sme2_mlsl.c
@@ -311,7 +311,7 @@ void test_svmls_single1_s16(uint32_t slice_base, svint16_t zn, svint16_t zm) __a
//
void test_svmls_single2_f16(uint32_t slice_base, svfloat16x2_t zn, svfloat16_t zm) __arm_streaming __arm_shared_za
{
- SVE_ACLE_FUNC(svmls,_single,_za32,_f16,_vg2x2)(slice_base, zn, zm);
+ SVE_ACLE_FUNC(svmls_single_za32,,_f16,,_vg2x2)(slice_base, zn, zm);
}
// CHECK-LABEL: @test_svmls_single2_bf16(
@@ -330,7 +330,7 @@ void test_svmls_single2_f16(uint32_t slice_base, svfloat16x2_t zn, svfloat16_t z
//
void test_svmls_single2_bf16(uint32_t slice_base, svbfloat16x2_t zn, svbfloat16_t zm) __arm_streaming __arm_shared_za
{
- SVE_ACLE_FUNC(svmls,_single,_za32,_bf16,_vg2x2)(slice_base, zn, zm);
+ SVE_ACLE_FUNC(svmls_single_za32,,_bf16,,_vg2x2)(slice_base, zn, zm);
}
// CHECK-LABEL: @test_svmls_single2_u16(
@@ -349,7 +349,7 @@ void test_svmls_single2_bf16(uint32_t slice_base, svbfloat16x2_t zn, svbfloat16_
//
void test_svmls_single2_u16(uint32_t slice_base, svuint16x2_t zn, svuint16_t zm) __arm_streaming __arm_shared_za
{
- SVE_ACLE_FUNC(svmls,_single,_za32,_u16,_vg2x2)(slice_base, zn, zm);
+ SVE_ACLE_FUNC(svmls_single_za32,,_u16,,_vg2x2)(slice_base, zn, zm);
}
// CHECK-LABEL: @test_svmls_single2_s16(
@@ -368,7 +368,7 @@ void test_svmls_single2_u16(uint32_t slice_base, svuint16x2_t zn, svuint16_t zm)
//
void test_svmls_single2_s16(uint32_t slice_base, svint16x2_t zn, svint16_t zm) __arm_streaming __arm_shared_za
{
- SVE_ACLE_FUNC(svmls,_single,_za32,_s16,_vg2x2)(slice_base, zn, zm);
+ SVE_ACLE_FUNC(svmls_single_za32,,_s16,,_vg2x2)(slice_base, zn, zm);
}
// CHECK-LABEL: @test_svmls_single4_f16(
@@ -391,7 +391,7 @@ void test_svmls_single2_s16(uint32_t slice_base, svint16x2_t zn, svint16_t zm) _
//
void test_svmls_single4_f16(uint32_t slice_base, svfloat16x4_t zn, svfloat16_t zm) __arm_streaming __arm_shared_za
{
- SVE_ACLE_FUNC(svmls,_single,_za32,_f16,_vg2x4)(slice_base, zn, zm);
+ SVE_ACLE_FUNC(svmls_single_za32,,_f16,,_vg2x4)(slice_base, zn, zm);
}
// CHECK-LABEL: @test_svmls_single4_bf16(
@@ -414,7 +414,7 @@ void test_svmls_single4_f16(uint32_t slice_base, svfloat16x4_t zn, svfloat16_t z
//
void test_svmls_single4_bf16(uint32_t slice_base, svbfloat16x4_t zn, svbfloat16_t zm) __arm_streaming __arm_shared_za
{
- SVE_ACLE_FUNC(svmls,_single,_za32,_bf16,_vg2x4)(slice_base, zn, zm);
+ SVE_ACLE_FUNC(svmls_single_za32,,_bf16,,_vg2x4)(slice_base, zn, zm);
}
// CHECK-LABEL: @test_svmls_single4_u16(
@@ -437,7 +437,7 @@ void test_svmls_single4_bf16(uint32_t slice_base, svbfloat16x4_t zn, svbfloat16_
//
void test_svmls_single4_u16(uint32_t slice_base, svuint16x4_t zn, svuint16_t zm) __arm_streaming __arm_shared_za
{
- SVE_ACLE_FUNC(svmls,_single,_za32,_u16,_vg2x4)(slice_base, zn, zm);
+ SVE_ACLE_FUNC(svmls_single_za32,,_u16,,_vg2x4)(slice_base, zn, zm);
}
// CHECK-LABEL: @test_svmls_single4_s16(
@@ -460,7 +460,7 @@ void test_svmls_single4_u16(uint32_t slice_base, svuint16x4_t zn, svuint16_t zm)
//
void test_svmls_single4_s16(uint32_t slice_base, svint16x4_t zn, svint16_t zm) __arm_streaming __arm_shared_za
{
- SVE_ACLE_FUNC(svmls,_single,_za32,_s16,_vg2x4)(slice_base, zn, zm);
+ SVE_ACLE_FUNC(svmls_single_za32,,_s16,,_vg2x4)(slice_base, zn, zm);
}
//
@@ -479,7 +479,7 @@ void test_svmls_single4_s16(uint32_t slice_base, svint16x4_t zn, svint16_t zm) _
//
void test_svmls_lane1_f16(uint32_t slice_base, svfloat16_t zn, svfloat16_t zm) __arm_streaming __arm_shared_za
{
- SVE_ACLE_FUNC(svmls_lane,,_za32,_f16,_vg2x1)(slice_base, zn, zm, 7);
+ SVE_ACLE_FUNC(svmls_lane_za32,,_f16,,_vg2x1)(slice_base, zn, zm, 7);
}
// CHECK-LABEL: @test_svmls_lane1_bf16(
@@ -494,7 +494,7 @@ void test_svmls_lane1_f16(uint32_t slice_base, svfloat16_t zn, svfloat16_t zm) _
//
void test_svmls_lane1_bf16(uint32_t slice_base, svbfloat16_t zn, svbfloat16_t zm) __arm_streaming __arm_shared_za
{
- SVE_ACLE_FUNC(svmls_lane,,_za32,_bf16,_vg2x1)(slice_base, zn, zm, 7);
+ SVE_ACLE_FUNC(svmls_lane_za32,,_bf16,,_vg2x1)(slice_base, zn, zm, 7);
}
// CHECK-LABEL: @test_svmls_lane1_u16(
@@ -509,7 +509,7 @@ void test_svmls_lane1_bf16(uint32_t slice_base, svbfloat16_t zn, svbfloat16_t zm
//
void test_svmls_lane1_u16(uint32_t slice_base, svuint16_t zn, svuint16_t zm) __arm_streaming __arm_shared_za
{
- SVE_ACLE_FUNC(svmls_lane,,_za32,_u16,_vg2x1)(slice_base, zn, zm, 7);
+ SVE_ACLE_FUNC(svmls_lane_za32,,_u16,,_vg2x1)(slice_base, zn, zm, 7);
}
// CHECK-LABEL: @test_svmls_lane1_s16(
@@ -524,7 +524,7 @@ void test_svmls_lane1_u16(uint32_t slice_base, svuint16_t zn, svuint16_t zm) __a
//
void test_svmls_lane1_s16(uint32_t slice_base, svint16_t zn, svint16_t zm) __arm_streaming __arm_shared_za
{
- SVE_ACLE_FUNC(svmls_lane,,_za32,_s16,_vg2x1)(slice_base, zn, zm, 7);
+ SVE_ACLE_FUNC(svmls_lane_za32,,_s16,,_vg2x1)(slice_base, zn, zm, 7);
}
// CHECK-LABEL: @test_svmls_lane2_f16(
@@ -543,7 +543,7 @@ void test_svmls_lane1_s16(uint32_t slice_base, svint16_t zn, svint16_t zm) __arm
//
void test_svmls_lane2_f16(uint32_t slice_base, svfloat16x2_t zn, svfloat16_t zm) __arm_streaming __arm_shared_za
{
- SVE_ACLE_FUNC(svmls_lane,,_za32,_f16,_vg2x2)(slice_base, zn, zm, 7);
+ SVE_ACLE_FUNC(svmls_lane_za32,,_f16,,_vg2x2)(slice_base, zn, zm, 7);
}
// CHECK-LABEL: @test_svmls_lane2_bf16(
@@ -562,7 +562,7 @@ void test_svmls_lane2_f16(uint32_t slice_base, svfloat16x2_t zn, svfloat16_t zm)
//
void test_svmls_lane2_bf16(uint32_t slice_base, svbfloat16x2_t zn, svbfloat16_t zm) __arm_streaming __arm_shared_za
{
- SVE_ACLE_FUNC(svmls_lane,,_za32,_bf16,_vg2x2)(slice_base, zn, zm, 7);
+ SVE_ACLE_FUNC(svmls_lane_za32,,_bf16,,_vg2x2)(slice_base, zn, zm, 7);
}
// CHECK-LABEL: @test_svmls_lane2_u16(
@@ -581,7 +581,7 @@ void test_svmls_lane2_bf16(uint32_t slice_base, svbfloat16x2_t zn, svbfloat16_t
//
void test_svmls_lane2_u16(uint32_t slice_base, svuint16x2_t zn, svuint16_t zm) __arm_streaming __arm_shared_za
{
- SVE_ACLE_FUNC(svmls_lane,,_za32,_u16,_vg2x2)(slice_base, zn, zm, 7);
+ SVE_ACLE_FUNC(svmls_lane_za32,,_u16,,_vg2x2)(slice_base, zn, zm, 7);
}
// CHECK-LABEL: @test_svmls_lane2_s16(
@@ -600,7 +600,7 @@ void test_svmls_lane2_u16(uint32_t slice_base, svuint16x2_t zn, svuint16_t zm) _
//
void test_svmls_lane2_s16(uint32_t slice_base, svint16x2_t zn, svint16_t zm) __arm_streaming __arm_shared_za
{
- SVE_ACLE_FUNC(svmls_lane,,_za32,_s16,_vg2x2)(slice_base, zn, zm, 7);
+ SVE_ACLE_FUNC(svmls_lane_za32,,_s16,,_vg2x2)(slice_base, zn, zm, 7);
}
// CHECK-LABEL: @test_svmls_lane4_f16(
@@ -623,7 +623,7 @@ void test_svmls_lane2_s16(uint32_t slice_base, svint16x2_t zn, svint16_t zm) __a
//
void test_svmls_lane4_f16(uint32_t slice_base, svfloat16x4_t zn, svfloat16_t zm) __arm_streaming __arm_shared_za
{
- SVE_ACLE_FUNC(svmls_lane,,_za32,_f16,_vg2x4)(slice_base, zn, zm, 7);
+ SVE_ACLE_FUNC(svmls_lane_za32,,_f16,,_vg2x4)(slice_base, zn, zm, 7);
}
// CHECK-LABEL: @test_svmls_lane4_bf16(
@@ -646,7 +646,7 @@ void test_svmls_lane4_f16(uint32_t slice_base, svfloat16x4_t zn, svfloat16_t zm)
//
void test_svmls_lane4_bf16(uint32_t slice_base, svbfloat16x4_t zn, svbfloat16_t zm) __arm_streaming __arm_shared_za
{
- SVE_ACLE_FUNC(svmls_lane,,_za32,_bf16,_vg2x4)(slice_base, zn, zm, 7);
+ SVE_ACLE_FUNC(svmls_lane_za32,,_bf16,,_vg2x4)(slice_base, zn, zm, 7);
}
// CHECK-LABEL: @test_svmls_lane4_u16(
@@ -669,7 +669,7 @@ void test_svmls_lane4_bf16(uint32_t slice_base, svbfloat16x4_t zn, svbfloat16_t
//
void test_svmls_lane4_u16(uint32_t slice_base, svuint16x4_t zn, svuint16_t zm) __arm_streaming __arm_shared_za
{
- SVE_ACLE_FUNC(svmls_lane,,_za32,_u16,_vg2x4)(slice_base, zn, zm, 7);
+ SVE_ACLE_FUNC(svmls_lane_za32,,_u16,,_vg2x4)(slice_base, zn, zm, 7);
}
// CHECK-LABEL: @test_svmls_lane4_s16(
@@ -692,5 +692,5 @@ void test_svmls_lane4_u16(uint32_t slice_base, svuint16x4_t zn, svuint16_t zm) _
//
void test_svmls_lane4_s16(uint32_t slice_base, svint16x4_t zn, svint16_t zm) __arm_streaming __arm_shared_za
{
- SVE_ACLE_FUNC(svmls_lane,,_za32,_s16,_vg2x4)(slice_base, zn, zm, 7);
+ SVE_ACLE_FUNC(svmls_lane_za32,,_s16,,_vg2x4)(slice_base, zn, zm, 7);
}
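
The macro reshuffle throughout this patch only changes which name fragments land in the unused argument slots. The SVE_ACLE_FUNC definition itself sits at the top of each test file and is outside this diff; a sketch of its assumed shape, plus one expansion taken from the acle_sme2_mla.c hunk above:

#ifdef SVE_OVERLOADED_FORMS
#define SVE_ACLE_FUNC(A1, A2_UNUSED, A3, A4_UNUSED, A5) A1##A3##A5
#else
#define SVE_ACLE_FUNC(A1, A2, A3, A4, A5) A1##A2##A3##A4##A5
#endif

/* SVE_ACLE_FUNC(svmla_za32,_f32,_vg1x2,,) then expands to:
 *   svmla_za32_f32_vg1x2  -- fully-qualified form
 *   svmla_za32_vg1x2      -- overloaded form (slots A2/A4 dropped)
 */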