[clang] [llvm] [Clang][AArch64][SVE2p3][SME2p3] Add intrinsics for v9.7a Two-way signed/unsigned absolute difference sum and accumulate long ops (PR #188972)

Mon Apr 27 07:44:26 PDT 2026

https://github.com/amilendra updated https://github.com/llvm/llvm-project/pull/188972

>From f127dbfdebd63e439d0531d073a940b4846d0c1c Mon Sep 17 00:00:00 2001
From: Amilendra Kodithuwakku <amilendra.kodithuwakku at arm.com>
Date: Fri, 24 Apr 2026 13:57:09 +0100
Subject: [PATCH 1/2] [Clang][AArch64][SVE2p3][SME2p3] Add intrinsics for v9.7a
 Two-way signed/unsigned absolute difference sum and accumulate long ops

Add the following new Clang intrinsics, based on the ACLE specification pull request https://github.com/ARM-software/acle/pull/428 (Add alpha support for 9.7 data processing intrinsics):

SABAL (Two-way signed absolute difference sum and accumulate long)
  - svint16_t svabal[_s16](svint16_t, svint8_t, svint8_t)   / svint16_t svabal[_n_s16](svint16_t, svint8_t, int8_t)
  - svint32_t svabal[_s32](svint32_t, svint16_t, svint16_t) / svint32_t svabal[_n_s32](svint32_t, svint16_t, int16_t)
  - svint64_t svabal[_s64](svint64_t, svint32_t, svint32_t) / svint64_t svabal[_n_s64](svint64_t, svint32_t, int32_t)

UABAL (Two-way unsigned absolute difference sum and accumulate long)
  - svuint16_t svabal[_u16](svuint16_t, svuint8_t, svuint8_t)   / svuint16_t svabal[_n_u16](svuint16_t, svuint8_t, uint8_t)
  - svuint32_t svabal[_u32](svuint32_t, svuint16_t, svuint16_t) / svuint32_t svabal[_n_u32](svuint32_t, svuint16_t, uint16_t)
  - svuint64_t svabal[_u64](svuint64_t, svuint32_t, svuint32_t) / svuint64_t svabal[_n_u64](svuint64_t, svuint32_t, uint32_t)
---
 clang/include/clang/Basic/arm_sve.td          |  11 +
 .../sve2p3-intrinsics/acle_sve2p3_svabal.c    | 479 ++++++++++++++++++
 ...e2p3_RP___sme_AND_LP_sve2p3_OR_sme2p3_RP.c | 138 +++++
 .../aarch64-sve2p3-intrinsics/acle_sve2p3.cpp |  63 +++
 llvm/include/llvm/IR/IntrinsicsAArch64.td     |   2 +
 .../lib/Target/AArch64/AArch64SVEInstrInfo.td |   4 +-
 llvm/lib/Target/AArch64/SVEInstrFormats.td    |   7 +-
 .../sve2p3-intrinsics-abal.ll                 |  58 +++
 8 files changed, 759 insertions(+), 3 deletions(-)
 create mode 100644 clang/test/CodeGen/AArch64/sve2p3-intrinsics/acle_sve2p3_svabal.c
 create mode 100644 clang/test/Sema/aarch64-sve2p3-intrinsics/acle_sve2p3.cpp
 create mode 100644 llvm/test/CodeGen/AArch64/sve2p3-intrinsics/sve2p3-intrinsics-abal.ll

diff --git a/clang/include/clang/Basic/arm_sve.td b/clang/include/clang/Basic/arm_sve.td
index e4e8c848a7815..ead004757f179 100644
--- a/clang/include/clang/Basic/arm_sve.td
+++ b/clang/include/clang/Basic/arm_sve.td
@@ -1342,6 +1342,17 @@ defm SVRECPE  : SInstZPZ<"svrecpe",  "Ui",   "aarch64_sve_urecpe">;
 defm SVRSQRTE : SInstZPZ<"svrsqrte", "Ui",   "aarch64_sve_ursqrte">;
 }
 
+////////////////////////////////////////////////////////////////////////////////
+// SVE2.3 - Two-way signed/unsigned absolute difference sum and accumulate long
+
+let SVETargetGuard = "sve2p3|sme2p3", SMETargetGuard = "sve2p3|sme2p3" in {
+  def SVABAL_S : SInst<"svabal[_{d}]", "ddhh", "sil"   , MergeNone, "aarch64_sve_sabal", [VerifyRuntimeMode]>;
+  def SVABAL_S_N : SInst<"svabal[_n_{d}]", "ddhR", "sil"   , MergeNone, "aarch64_sve_sabal", [VerifyRuntimeMode]>;
+
+  def SVABAL_U : SInst<"svabal[_{d}]", "ddhh", "UsUiUl", MergeNone, "aarch64_sve_uabal", [VerifyRuntimeMode]>;
+  def SVABAL_U_N : SInst<"svabal[_n_{d}]", "ddhR", "UsUiUl", MergeNone, "aarch64_sve_uabal", [VerifyRuntimeMode]>;
+}
+
 //------------------------------------------------------------------------------
 
 multiclass SInstZPZxZ<string name, string types, string pat_v, string pat_n, string m_intrinsic, string x_intrinsic, list<FlagType> flags=[]> {
diff --git a/clang/test/CodeGen/AArch64/sve2p3-intrinsics/acle_sve2p3_svabal.c b/clang/test/CodeGen/AArch64/sve2p3-intrinsics/acle_sve2p3_svabal.c
new file mode 100644
index 0000000000000..8519b70bc6260
--- /dev/null
+++ b/clang/test/CodeGen/AArch64/sve2p3-intrinsics/acle_sve2p3_svabal.c
@@ -0,0 +1,479 @@
+// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 6
+// RUN: %clang_cc1 -triple aarch64 -target-feature +sve -target-feature +sve2 -target-feature +sve2p3 -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | FileCheck %s
+// RUN: %clang_cc1 -triple aarch64 -target-feature +sve -target-feature +sve2 -target-feature +sve2p3 -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | FileCheck %s -check-prefix=CPP-CHECK
+// RUN: %clang_cc1 -triple aarch64 -target-feature +sme                       -target-feature +sme2p3 -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | FileCheck %s
+// RUN: %clang_cc1 -triple aarch64 -target-feature +sve                       -target-feature +sme2p3 -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | FileCheck %s
+// RUN: %clang_cc1 -triple aarch64 -target-feature +sme                       -target-feature +sve2p3 -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | FileCheck %s
+// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64 -target-feature +sve -target-feature +sve2 -target-feature +sve2p3 -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | FileCheck %s
+// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64 -target-feature +sve -target-feature +sve2 -target-feature +sve2p3 -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | FileCheck %s -check-prefix=CPP-CHECK
+// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64 -target-feature +sme                       -target-feature +sme2p3 -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | FileCheck %s
+// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64 -target-feature +sve                       -target-feature +sme2p3 -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | FileCheck %s
+// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64 -target-feature +sme                       -target-feature +sve2p3 -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | FileCheck %s
+
+// RUN: %clang_cc1 -triple aarch64 -target-feature +sve -target-feature +sve2 -target-feature +sve2p3 -S -disable-O0-optnone -Werror -Wall -o /dev/null %s
+// RUN: %clang_cc1 -triple aarch64 -target-feature +sme -target-feature +sme2p3 -S -disable-O0-optnone -Werror -Wall -o /dev/null %s
+
+// REQUIRES: aarch64-registered-target
+
+
+#include <arm_sve.h>
+
+#if defined(__ARM_FEATURE_SME) && defined(__ARM_FEATURE_SVE)
+#define ATTR __arm_streaming_compatible
+#elif defined(__ARM_FEATURE_SME)
+#define ATTR __arm_streaming
+#else
+#define ATTR
+#endif
+
+#ifdef SVE_OVERLOADED_FORMS
+// A simple used,unused... macro, long enough to represent any SVE builtin.
+#define SVE_ACLE_FUNC(A1,A2_UNUSED,A3_UNUSED) A1
+#else
+#define SVE_ACLE_FUNC(A1,A2,A3) A1##A2##A3
+#endif
+
+// CHECK-LABEL: define dso_local <vscale x 8 x i16> @test_svabal_s16(
+// CHECK-SAME: <vscale x 8 x i16> [[ZDA:%.*]], <vscale x 16 x i8> [[ZN:%.*]], <vscale x 16 x i8> [[ZM:%.*]]) #[[ATTR0:[0-9]+]] {
+// CHECK-NEXT:  [[ENTRY:.*:]]
+// CHECK-NEXT:    [[ZDA_ADDR:%.*]] = alloca <vscale x 8 x i16>, align 16
+// CHECK-NEXT:    [[ZN_ADDR:%.*]] = alloca <vscale x 16 x i8>, align 16
+// CHECK-NEXT:    [[ZM_ADDR:%.*]] = alloca <vscale x 16 x i8>, align 16
+// CHECK-NEXT:    store <vscale x 8 x i16> [[ZDA]], ptr [[ZDA_ADDR]], align 16
+// CHECK-NEXT:    store <vscale x 16 x i8> [[ZN]], ptr [[ZN_ADDR]], align 16
+// CHECK-NEXT:    store <vscale x 16 x i8> [[ZM]], ptr [[ZM_ADDR]], align 16
+// CHECK-NEXT:    [[TMP0:%.*]] = load <vscale x 8 x i16>, ptr [[ZDA_ADDR]], align 16
+// CHECK-NEXT:    [[TMP1:%.*]] = load <vscale x 16 x i8>, ptr [[ZN_ADDR]], align 16
+// CHECK-NEXT:    [[TMP2:%.*]] = load <vscale x 16 x i8>, ptr [[ZM_ADDR]], align 16
+// CHECK-NEXT:    [[TMP3:%.*]] = call <vscale x 8 x i16> @llvm.aarch64.sve.sabal.nxv8i16(<vscale x 8 x i16> [[TMP0]], <vscale x 16 x i8> [[TMP1]], <vscale x 16 x i8> [[TMP2]])
+// CHECK-NEXT:    ret <vscale x 8 x i16> [[TMP3]]
+//
+// CPP-CHECK-LABEL: define dso_local <vscale x 8 x i16> @_Z15test_svabal_s16u11__SVInt16_tu10__SVInt8_tS0_(
+// CPP-CHECK-SAME: <vscale x 8 x i16> [[ZDA:%.*]], <vscale x 16 x i8> [[ZN:%.*]], <vscale x 16 x i8> [[ZM:%.*]]) #[[ATTR0:[0-9]+]] {
+// CPP-CHECK-NEXT:  [[ENTRY:.*:]]
+// CPP-CHECK-NEXT:    [[ZDA_ADDR:%.*]] = alloca <vscale x 8 x i16>, align 16
+// CPP-CHECK-NEXT:    [[ZN_ADDR:%.*]] = alloca <vscale x 16 x i8>, align 16
+// CPP-CHECK-NEXT:    [[ZM_ADDR:%.*]] = alloca <vscale x 16 x i8>, align 16
+// CPP-CHECK-NEXT:    store <vscale x 8 x i16> [[ZDA]], ptr [[ZDA_ADDR]], align 16
+// CPP-CHECK-NEXT:    store <vscale x 16 x i8> [[ZN]], ptr [[ZN_ADDR]], align 16
+// CPP-CHECK-NEXT:    store <vscale x 16 x i8> [[ZM]], ptr [[ZM_ADDR]], align 16
+// CPP-CHECK-NEXT:    [[TMP0:%.*]] = load <vscale x 8 x i16>, ptr [[ZDA_ADDR]], align 16
+// CPP-CHECK-NEXT:    [[TMP1:%.*]] = load <vscale x 16 x i8>, ptr [[ZN_ADDR]], align 16
+// CPP-CHECK-NEXT:    [[TMP2:%.*]] = load <vscale x 16 x i8>, ptr [[ZM_ADDR]], align 16
+// CPP-CHECK-NEXT:    [[TMP3:%.*]] = call <vscale x 8 x i16> @llvm.aarch64.sve.sabal.nxv8i16(<vscale x 8 x i16> [[TMP0]], <vscale x 16 x i8> [[TMP1]], <vscale x 16 x i8> [[TMP2]])
+// CPP-CHECK-NEXT:    ret <vscale x 8 x i16> [[TMP3]]
+//
+svint16_t test_svabal_s16(svint16_t zda, svint8_t zn, svint8_t zm)  ATTR
+{
+  return SVE_ACLE_FUNC(svabal,,_s16)(zda, zn, zm);
+}
+
+// CHECK-LABEL: define dso_local <vscale x 8 x i16> @test_svabal_n_s16(
+// CHECK-SAME: <vscale x 8 x i16> [[ZDA:%.*]], <vscale x 16 x i8> [[ZN:%.*]], i8 noundef [[ZM:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT:  [[ENTRY:.*:]]
+// CHECK-NEXT:    [[ZDA_ADDR:%.*]] = alloca <vscale x 8 x i16>, align 16
+// CHECK-NEXT:    [[ZN_ADDR:%.*]] = alloca <vscale x 16 x i8>, align 16
+// CHECK-NEXT:    [[ZM_ADDR:%.*]] = alloca i8, align 1
+// CHECK-NEXT:    store <vscale x 8 x i16> [[ZDA]], ptr [[ZDA_ADDR]], align 16
+// CHECK-NEXT:    store <vscale x 16 x i8> [[ZN]], ptr [[ZN_ADDR]], align 16
+// CHECK-NEXT:    store i8 [[ZM]], ptr [[ZM_ADDR]], align 1
+// CHECK-NEXT:    [[TMP0:%.*]] = load <vscale x 8 x i16>, ptr [[ZDA_ADDR]], align 16
+// CHECK-NEXT:    [[TMP1:%.*]] = load <vscale x 16 x i8>, ptr [[ZN_ADDR]], align 16
+// CHECK-NEXT:    [[TMP2:%.*]] = load i8, ptr [[ZM_ADDR]], align 1
+// CHECK-NEXT:    [[DOTSPLATINSERT:%.*]] = insertelement <vscale x 16 x i8> poison, i8 [[TMP2]], i64 0
+// CHECK-NEXT:    [[DOTSPLAT:%.*]] = shufflevector <vscale x 16 x i8> [[DOTSPLATINSERT]], <vscale x 16 x i8> poison, <vscale x 16 x i32> zeroinitializer
+// CHECK-NEXT:    [[TMP3:%.*]] = call <vscale x 8 x i16> @llvm.aarch64.sve.sabal.nxv8i16(<vscale x 8 x i16> [[TMP0]], <vscale x 16 x i8> [[TMP1]], <vscale x 16 x i8> [[DOTSPLAT]])
+// CHECK-NEXT:    ret <vscale x 8 x i16> [[TMP3]]
+//
+// CPP-CHECK-LABEL: define dso_local <vscale x 8 x i16> @_Z17test_svabal_n_s16u11__SVInt16_tu10__SVInt8_ta(
+// CPP-CHECK-SAME: <vscale x 8 x i16> [[ZDA:%.*]], <vscale x 16 x i8> [[ZN:%.*]], i8 noundef [[ZM:%.*]]) #[[ATTR0]] {
+// CPP-CHECK-NEXT:  [[ENTRY:.*:]]
+// CPP-CHECK-NEXT:    [[ZDA_ADDR:%.*]] = alloca <vscale x 8 x i16>, align 16
+// CPP-CHECK-NEXT:    [[ZN_ADDR:%.*]] = alloca <vscale x 16 x i8>, align 16
+// CPP-CHECK-NEXT:    [[ZM_ADDR:%.*]] = alloca i8, align 1
+// CPP-CHECK-NEXT:    store <vscale x 8 x i16> [[ZDA]], ptr [[ZDA_ADDR]], align 16
+// CPP-CHECK-NEXT:    store <vscale x 16 x i8> [[ZN]], ptr [[ZN_ADDR]], align 16
+// CPP-CHECK-NEXT:    store i8 [[ZM]], ptr [[ZM_ADDR]], align 1
+// CPP-CHECK-NEXT:    [[TMP0:%.*]] = load <vscale x 8 x i16>, ptr [[ZDA_ADDR]], align 16
+// CPP-CHECK-NEXT:    [[TMP1:%.*]] = load <vscale x 16 x i8>, ptr [[ZN_ADDR]], align 16
+// CPP-CHECK-NEXT:    [[TMP2:%.*]] = load i8, ptr [[ZM_ADDR]], align 1
+// CPP-CHECK-NEXT:    [[DOTSPLATINSERT:%.*]] = insertelement <vscale x 16 x i8> poison, i8 [[TMP2]], i64 0
+// CPP-CHECK-NEXT:    [[DOTSPLAT:%.*]] = shufflevector <vscale x 16 x i8> [[DOTSPLATINSERT]], <vscale x 16 x i8> poison, <vscale x 16 x i32> zeroinitializer
+// CPP-CHECK-NEXT:    [[TMP3:%.*]] = call <vscale x 8 x i16> @llvm.aarch64.sve.sabal.nxv8i16(<vscale x 8 x i16> [[TMP0]], <vscale x 16 x i8> [[TMP1]], <vscale x 16 x i8> [[DOTSPLAT]])
+// CPP-CHECK-NEXT:    ret <vscale x 8 x i16> [[TMP3]]
+//
+svint16_t test_svabal_n_s16(svint16_t zda, svint8_t zn, int8_t zm)  ATTR
+{
+  return SVE_ACLE_FUNC(svabal,_n,_s16)(zda, zn, zm);
+}
+
+
+// CHECK-LABEL: define dso_local <vscale x 4 x i32> @test_svabal_s32(
+// CHECK-SAME: <vscale x 4 x i32> [[ZDA:%.*]], <vscale x 8 x i16> [[ZN:%.*]], <vscale x 8 x i16> [[ZM:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT:  [[ENTRY:.*:]]
+// CHECK-NEXT:    [[ZDA_ADDR:%.*]] = alloca <vscale x 4 x i32>, align 16
+// CHECK-NEXT:    [[ZN_ADDR:%.*]] = alloca <vscale x 8 x i16>, align 16
+// CHECK-NEXT:    [[ZM_ADDR:%.*]] = alloca <vscale x 8 x i16>, align 16
+// CHECK-NEXT:    store <vscale x 4 x i32> [[ZDA]], ptr [[ZDA_ADDR]], align 16
+// CHECK-NEXT:    store <vscale x 8 x i16> [[ZN]], ptr [[ZN_ADDR]], align 16
+// CHECK-NEXT:    store <vscale x 8 x i16> [[ZM]], ptr [[ZM_ADDR]], align 16
+// CHECK-NEXT:    [[TMP0:%.*]] = load <vscale x 4 x i32>, ptr [[ZDA_ADDR]], align 16
+// CHECK-NEXT:    [[TMP1:%.*]] = load <vscale x 8 x i16>, ptr [[ZN_ADDR]], align 16
+// CHECK-NEXT:    [[TMP2:%.*]] = load <vscale x 8 x i16>, ptr [[ZM_ADDR]], align 16
+// CHECK-NEXT:    [[TMP3:%.*]] = call <vscale x 4 x i32> @llvm.aarch64.sve.sabal.nxv4i32(<vscale x 4 x i32> [[TMP0]], <vscale x 8 x i16> [[TMP1]], <vscale x 8 x i16> [[TMP2]])
+// CHECK-NEXT:    ret <vscale x 4 x i32> [[TMP3]]
+//
+// CPP-CHECK-LABEL: define dso_local <vscale x 4 x i32> @_Z15test_svabal_s32u11__SVInt32_tu11__SVInt16_tS0_(
+// CPP-CHECK-SAME: <vscale x 4 x i32> [[ZDA:%.*]], <vscale x 8 x i16> [[ZN:%.*]], <vscale x 8 x i16> [[ZM:%.*]]) #[[ATTR0]] {
+// CPP-CHECK-NEXT:  [[ENTRY:.*:]]
+// CPP-CHECK-NEXT:    [[ZDA_ADDR:%.*]] = alloca <vscale x 4 x i32>, align 16
+// CPP-CHECK-NEXT:    [[ZN_ADDR:%.*]] = alloca <vscale x 8 x i16>, align 16
+// CPP-CHECK-NEXT:    [[ZM_ADDR:%.*]] = alloca <vscale x 8 x i16>, align 16
+// CPP-CHECK-NEXT:    store <vscale x 4 x i32> [[ZDA]], ptr [[ZDA_ADDR]], align 16
+// CPP-CHECK-NEXT:    store <vscale x 8 x i16> [[ZN]], ptr [[ZN_ADDR]], align 16
+// CPP-CHECK-NEXT:    store <vscale x 8 x i16> [[ZM]], ptr [[ZM_ADDR]], align 16
+// CPP-CHECK-NEXT:    [[TMP0:%.*]] = load <vscale x 4 x i32>, ptr [[ZDA_ADDR]], align 16
+// CPP-CHECK-NEXT:    [[TMP1:%.*]] = load <vscale x 8 x i16>, ptr [[ZN_ADDR]], align 16
+// CPP-CHECK-NEXT:    [[TMP2:%.*]] = load <vscale x 8 x i16>, ptr [[ZM_ADDR]], align 16
+// CPP-CHECK-NEXT:    [[TMP3:%.*]] = call <vscale x 4 x i32> @llvm.aarch64.sve.sabal.nxv4i32(<vscale x 4 x i32> [[TMP0]], <vscale x 8 x i16> [[TMP1]], <vscale x 8 x i16> [[TMP2]])
+// CPP-CHECK-NEXT:    ret <vscale x 4 x i32> [[TMP3]]
+//
+svint32_t test_svabal_s32(svint32_t zda, svint16_t zn, svint16_t zm)  ATTR
+{
+  return SVE_ACLE_FUNC(svabal,,_s32)(zda, zn, zm);
+}
+
+// CHECK-LABEL: define dso_local <vscale x 4 x i32> @test_svabal_n_s32(
+// CHECK-SAME: <vscale x 4 x i32> [[ZDA:%.*]], <vscale x 8 x i16> [[ZN:%.*]], i16 noundef [[ZM:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT:  [[ENTRY:.*:]]
+// CHECK-NEXT:    [[ZDA_ADDR:%.*]] = alloca <vscale x 4 x i32>, align 16
+// CHECK-NEXT:    [[ZN_ADDR:%.*]] = alloca <vscale x 8 x i16>, align 16
+// CHECK-NEXT:    [[ZM_ADDR:%.*]] = alloca i16, align 2
+// CHECK-NEXT:    store <vscale x 4 x i32> [[ZDA]], ptr [[ZDA_ADDR]], align 16
+// CHECK-NEXT:    store <vscale x 8 x i16> [[ZN]], ptr [[ZN_ADDR]], align 16
+// CHECK-NEXT:    store i16 [[ZM]], ptr [[ZM_ADDR]], align 2
+// CHECK-NEXT:    [[TMP0:%.*]] = load <vscale x 4 x i32>, ptr [[ZDA_ADDR]], align 16
+// CHECK-NEXT:    [[TMP1:%.*]] = load <vscale x 8 x i16>, ptr [[ZN_ADDR]], align 16
+// CHECK-NEXT:    [[TMP2:%.*]] = load i16, ptr [[ZM_ADDR]], align 2
+// CHECK-NEXT:    [[DOTSPLATINSERT:%.*]] = insertelement <vscale x 8 x i16> poison, i16 [[TMP2]], i64 0
+// CHECK-NEXT:    [[DOTSPLAT:%.*]] = shufflevector <vscale x 8 x i16> [[DOTSPLATINSERT]], <vscale x 8 x i16> poison, <vscale x 8 x i32> zeroinitializer
+// CHECK-NEXT:    [[TMP3:%.*]] = call <vscale x 4 x i32> @llvm.aarch64.sve.sabal.nxv4i32(<vscale x 4 x i32> [[TMP0]], <vscale x 8 x i16> [[TMP1]], <vscale x 8 x i16> [[DOTSPLAT]])
+// CHECK-NEXT:    ret <vscale x 4 x i32> [[TMP3]]
+//
+// CPP-CHECK-LABEL: define dso_local <vscale x 4 x i32> @_Z17test_svabal_n_s32u11__SVInt32_tu11__SVInt16_ts(
+// CPP-CHECK-SAME: <vscale x 4 x i32> [[ZDA:%.*]], <vscale x 8 x i16> [[ZN:%.*]], i16 noundef [[ZM:%.*]]) #[[ATTR0]] {
+// CPP-CHECK-NEXT:  [[ENTRY:.*:]]
+// CPP-CHECK-NEXT:    [[ZDA_ADDR:%.*]] = alloca <vscale x 4 x i32>, align 16
+// CPP-CHECK-NEXT:    [[ZN_ADDR:%.*]] = alloca <vscale x 8 x i16>, align 16
+// CPP-CHECK-NEXT:    [[ZM_ADDR:%.*]] = alloca i16, align 2
+// CPP-CHECK-NEXT:    store <vscale x 4 x i32> [[ZDA]], ptr [[ZDA_ADDR]], align 16
+// CPP-CHECK-NEXT:    store <vscale x 8 x i16> [[ZN]], ptr [[ZN_ADDR]], align 16
+// CPP-CHECK-NEXT:    store i16 [[ZM]], ptr [[ZM_ADDR]], align 2
+// CPP-CHECK-NEXT:    [[TMP0:%.*]] = load <vscale x 4 x i32>, ptr [[ZDA_ADDR]], align 16
+// CPP-CHECK-NEXT:    [[TMP1:%.*]] = load <vscale x 8 x i16>, ptr [[ZN_ADDR]], align 16
+// CPP-CHECK-NEXT:    [[TMP2:%.*]] = load i16, ptr [[ZM_ADDR]], align 2
+// CPP-CHECK-NEXT:    [[DOTSPLATINSERT:%.*]] = insertelement <vscale x 8 x i16> poison, i16 [[TMP2]], i64 0
+// CPP-CHECK-NEXT:    [[DOTSPLAT:%.*]] = shufflevector <vscale x 8 x i16> [[DOTSPLATINSERT]], <vscale x 8 x i16> poison, <vscale x 8 x i32> zeroinitializer
+// CPP-CHECK-NEXT:    [[TMP3:%.*]] = call <vscale x 4 x i32> @llvm.aarch64.sve.sabal.nxv4i32(<vscale x 4 x i32> [[TMP0]], <vscale x 8 x i16> [[TMP1]], <vscale x 8 x i16> [[DOTSPLAT]])
+// CPP-CHECK-NEXT:    ret <vscale x 4 x i32> [[TMP3]]
+//
+svint32_t test_svabal_n_s32(svint32_t zda, svint16_t zn, int16_t zm)  ATTR
+{
+  return SVE_ACLE_FUNC(svabal,_n,_s32)(zda, zn, zm);
+}
+
+// CHECK-LABEL: define dso_local <vscale x 2 x i64> @test_svabal_s64(
+// CHECK-SAME: <vscale x 2 x i64> [[ZDA:%.*]], <vscale x 4 x i32> [[ZN:%.*]], <vscale x 4 x i32> [[ZM:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT:  [[ENTRY:.*:]]
+// CHECK-NEXT:    [[ZDA_ADDR:%.*]] = alloca <vscale x 2 x i64>, align 16
+// CHECK-NEXT:    [[ZN_ADDR:%.*]] = alloca <vscale x 4 x i32>, align 16
+// CHECK-NEXT:    [[ZM_ADDR:%.*]] = alloca <vscale x 4 x i32>, align 16
+// CHECK-NEXT:    store <vscale x 2 x i64> [[ZDA]], ptr [[ZDA_ADDR]], align 16
+// CHECK-NEXT:    store <vscale x 4 x i32> [[ZN]], ptr [[ZN_ADDR]], align 16
+// CHECK-NEXT:    store <vscale x 4 x i32> [[ZM]], ptr [[ZM_ADDR]], align 16
+// CHECK-NEXT:    [[TMP0:%.*]] = load <vscale x 2 x i64>, ptr [[ZDA_ADDR]], align 16
+// CHECK-NEXT:    [[TMP1:%.*]] = load <vscale x 4 x i32>, ptr [[ZN_ADDR]], align 16
+// CHECK-NEXT:    [[TMP2:%.*]] = load <vscale x 4 x i32>, ptr [[ZM_ADDR]], align 16
+// CHECK-NEXT:    [[TMP3:%.*]] = call <vscale x 2 x i64> @llvm.aarch64.sve.sabal.nxv2i64(<vscale x 2 x i64> [[TMP0]], <vscale x 4 x i32> [[TMP1]], <vscale x 4 x i32> [[TMP2]])
+// CHECK-NEXT:    ret <vscale x 2 x i64> [[TMP3]]
+//
+// CPP-CHECK-LABEL: define dso_local <vscale x 2 x i64> @_Z15test_svabal_s64u11__SVInt64_tu11__SVInt32_tS0_(
+// CPP-CHECK-SAME: <vscale x 2 x i64> [[ZDA:%.*]], <vscale x 4 x i32> [[ZN:%.*]], <vscale x 4 x i32> [[ZM:%.*]]) #[[ATTR0]] {
+// CPP-CHECK-NEXT:  [[ENTRY:.*:]]
+// CPP-CHECK-NEXT:    [[ZDA_ADDR:%.*]] = alloca <vscale x 2 x i64>, align 16
+// CPP-CHECK-NEXT:    [[ZN_ADDR:%.*]] = alloca <vscale x 4 x i32>, align 16
+// CPP-CHECK-NEXT:    [[ZM_ADDR:%.*]] = alloca <vscale x 4 x i32>, align 16
+// CPP-CHECK-NEXT:    store <vscale x 2 x i64> [[ZDA]], ptr [[ZDA_ADDR]], align 16
+// CPP-CHECK-NEXT:    store <vscale x 4 x i32> [[ZN]], ptr [[ZN_ADDR]], align 16
+// CPP-CHECK-NEXT:    store <vscale x 4 x i32> [[ZM]], ptr [[ZM_ADDR]], align 16
+// CPP-CHECK-NEXT:    [[TMP0:%.*]] = load <vscale x 2 x i64>, ptr [[ZDA_ADDR]], align 16
+// CPP-CHECK-NEXT:    [[TMP1:%.*]] = load <vscale x 4 x i32>, ptr [[ZN_ADDR]], align 16
+// CPP-CHECK-NEXT:    [[TMP2:%.*]] = load <vscale x 4 x i32>, ptr [[ZM_ADDR]], align 16
+// CPP-CHECK-NEXT:    [[TMP3:%.*]] = call <vscale x 2 x i64> @llvm.aarch64.sve.sabal.nxv2i64(<vscale x 2 x i64> [[TMP0]], <vscale x 4 x i32> [[TMP1]], <vscale x 4 x i32> [[TMP2]])
+// CPP-CHECK-NEXT:    ret <vscale x 2 x i64> [[TMP3]]
+//
+svint64_t test_svabal_s64(svint64_t zda, svint32_t zn, svint32_t zm)  ATTR
+{
+  return SVE_ACLE_FUNC(svabal,,_s64)(zda, zn, zm);
+}
+
+// CHECK-LABEL: define dso_local <vscale x 2 x i64> @test_svabal_n_s64(
+// CHECK-SAME: <vscale x 2 x i64> [[ZDA:%.*]], <vscale x 4 x i32> [[ZN:%.*]], i32 noundef [[ZM:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT:  [[ENTRY:.*:]]
+// CHECK-NEXT:    [[ZDA_ADDR:%.*]] = alloca <vscale x 2 x i64>, align 16
+// CHECK-NEXT:    [[ZN_ADDR:%.*]] = alloca <vscale x 4 x i32>, align 16
+// CHECK-NEXT:    [[ZM_ADDR:%.*]] = alloca i32, align 4
+// CHECK-NEXT:    store <vscale x 2 x i64> [[ZDA]], ptr [[ZDA_ADDR]], align 16
+// CHECK-NEXT:    store <vscale x 4 x i32> [[ZN]], ptr [[ZN_ADDR]], align 16
+// CHECK-NEXT:    store i32 [[ZM]], ptr [[ZM_ADDR]], align 4
+// CHECK-NEXT:    [[TMP0:%.*]] = load <vscale x 2 x i64>, ptr [[ZDA_ADDR]], align 16
+// CHECK-NEXT:    [[TMP1:%.*]] = load <vscale x 4 x i32>, ptr [[ZN_ADDR]], align 16
+// CHECK-NEXT:    [[TMP2:%.*]] = load i32, ptr [[ZM_ADDR]], align 4
+// CHECK-NEXT:    [[DOTSPLATINSERT:%.*]] = insertelement <vscale x 4 x i32> poison, i32 [[TMP2]], i64 0
+// CHECK-NEXT:    [[DOTSPLAT:%.*]] = shufflevector <vscale x 4 x i32> [[DOTSPLATINSERT]], <vscale x 4 x i32> poison, <vscale x 4 x i32> zeroinitializer
+// CHECK-NEXT:    [[TMP3:%.*]] = call <vscale x 2 x i64> @llvm.aarch64.sve.sabal.nxv2i64(<vscale x 2 x i64> [[TMP0]], <vscale x 4 x i32> [[TMP1]], <vscale x 4 x i32> [[DOTSPLAT]])
+// CHECK-NEXT:    ret <vscale x 2 x i64> [[TMP3]]
+//
+// CPP-CHECK-LABEL: define dso_local <vscale x 2 x i64> @_Z17test_svabal_n_s64u11__SVInt64_tu11__SVInt32_ti(
+// CPP-CHECK-SAME: <vscale x 2 x i64> [[ZDA:%.*]], <vscale x 4 x i32> [[ZN:%.*]], i32 noundef [[ZM:%.*]]) #[[ATTR0]] {
+// CPP-CHECK-NEXT:  [[ENTRY:.*:]]
+// CPP-CHECK-NEXT:    [[ZDA_ADDR:%.*]] = alloca <vscale x 2 x i64>, align 16
+// CPP-CHECK-NEXT:    [[ZN_ADDR:%.*]] = alloca <vscale x 4 x i32>, align 16
+// CPP-CHECK-NEXT:    [[ZM_ADDR:%.*]] = alloca i32, align 4
+// CPP-CHECK-NEXT:    store <vscale x 2 x i64> [[ZDA]], ptr [[ZDA_ADDR]], align 16
+// CPP-CHECK-NEXT:    store <vscale x 4 x i32> [[ZN]], ptr [[ZN_ADDR]], align 16
+// CPP-CHECK-NEXT:    store i32 [[ZM]], ptr [[ZM_ADDR]], align 4
+// CPP-CHECK-NEXT:    [[TMP0:%.*]] = load <vscale x 2 x i64>, ptr [[ZDA_ADDR]], align 16
+// CPP-CHECK-NEXT:    [[TMP1:%.*]] = load <vscale x 4 x i32>, ptr [[ZN_ADDR]], align 16
+// CPP-CHECK-NEXT:    [[TMP2:%.*]] = load i32, ptr [[ZM_ADDR]], align 4
+// CPP-CHECK-NEXT:    [[DOTSPLATINSERT:%.*]] = insertelement <vscale x 4 x i32> poison, i32 [[TMP2]], i64 0
+// CPP-CHECK-NEXT:    [[DOTSPLAT:%.*]] = shufflevector <vscale x 4 x i32> [[DOTSPLATINSERT]], <vscale x 4 x i32> poison, <vscale x 4 x i32> zeroinitializer
+// CPP-CHECK-NEXT:    [[TMP3:%.*]] = call <vscale x 2 x i64> @llvm.aarch64.sve.sabal.nxv2i64(<vscale x 2 x i64> [[TMP0]], <vscale x 4 x i32> [[TMP1]], <vscale x 4 x i32> [[DOTSPLAT]])
+// CPP-CHECK-NEXT:    ret <vscale x 2 x i64> [[TMP3]]
+//
+svint64_t test_svabal_n_s64(svint64_t zda, svint32_t zn, int32_t zm)  ATTR
+{
+  return SVE_ACLE_FUNC(svabal,_n,_s64)(zda, zn, zm);
+}
+
+// CHECK-LABEL: define dso_local <vscale x 8 x i16> @test_svabal_u16(
+// CHECK-SAME: <vscale x 8 x i16> [[ZDA:%.*]], <vscale x 16 x i8> [[ZN:%.*]], <vscale x 16 x i8> [[ZM:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT:  [[ENTRY:.*:]]
+// CHECK-NEXT:    [[ZDA_ADDR:%.*]] = alloca <vscale x 8 x i16>, align 16
+// CHECK-NEXT:    [[ZN_ADDR:%.*]] = alloca <vscale x 16 x i8>, align 16
+// CHECK-NEXT:    [[ZM_ADDR:%.*]] = alloca <vscale x 16 x i8>, align 16
+// CHECK-NEXT:    store <vscale x 8 x i16> [[ZDA]], ptr [[ZDA_ADDR]], align 16
+// CHECK-NEXT:    store <vscale x 16 x i8> [[ZN]], ptr [[ZN_ADDR]], align 16
+// CHECK-NEXT:    store <vscale x 16 x i8> [[ZM]], ptr [[ZM_ADDR]], align 16
+// CHECK-NEXT:    [[TMP0:%.*]] = load <vscale x 8 x i16>, ptr [[ZDA_ADDR]], align 16
+// CHECK-NEXT:    [[TMP1:%.*]] = load <vscale x 16 x i8>, ptr [[ZN_ADDR]], align 16
+// CHECK-NEXT:    [[TMP2:%.*]] = load <vscale x 16 x i8>, ptr [[ZM_ADDR]], align 16
+// CHECK-NEXT:    [[TMP3:%.*]] = call <vscale x 8 x i16> @llvm.aarch64.sve.uabal.nxv8i16(<vscale x 8 x i16> [[TMP0]], <vscale x 16 x i8> [[TMP1]], <vscale x 16 x i8> [[TMP2]])
+// CHECK-NEXT:    ret <vscale x 8 x i16> [[TMP3]]
+//
+// CPP-CHECK-LABEL: define dso_local <vscale x 8 x i16> @_Z15test_svabal_u16u12__SVUint16_tu11__SVUint8_tS0_(
+// CPP-CHECK-SAME: <vscale x 8 x i16> [[ZDA:%.*]], <vscale x 16 x i8> [[ZN:%.*]], <vscale x 16 x i8> [[ZM:%.*]]) #[[ATTR0]] {
+// CPP-CHECK-NEXT:  [[ENTRY:.*:]]
+// CPP-CHECK-NEXT:    [[ZDA_ADDR:%.*]] = alloca <vscale x 8 x i16>, align 16
+// CPP-CHECK-NEXT:    [[ZN_ADDR:%.*]] = alloca <vscale x 16 x i8>, align 16
+// CPP-CHECK-NEXT:    [[ZM_ADDR:%.*]] = alloca <vscale x 16 x i8>, align 16
+// CPP-CHECK-NEXT:    store <vscale x 8 x i16> [[ZDA]], ptr [[ZDA_ADDR]], align 16
+// CPP-CHECK-NEXT:    store <vscale x 16 x i8> [[ZN]], ptr [[ZN_ADDR]], align 16
+// CPP-CHECK-NEXT:    store <vscale x 16 x i8> [[ZM]], ptr [[ZM_ADDR]], align 16
+// CPP-CHECK-NEXT:    [[TMP0:%.*]] = load <vscale x 8 x i16>, ptr [[ZDA_ADDR]], align 16
+// CPP-CHECK-NEXT:    [[TMP1:%.*]] = load <vscale x 16 x i8>, ptr [[ZN_ADDR]], align 16
+// CPP-CHECK-NEXT:    [[TMP2:%.*]] = load <vscale x 16 x i8>, ptr [[ZM_ADDR]], align 16
+// CPP-CHECK-NEXT:    [[TMP3:%.*]] = call <vscale x 8 x i16> @llvm.aarch64.sve.uabal.nxv8i16(<vscale x 8 x i16> [[TMP0]], <vscale x 16 x i8> [[TMP1]], <vscale x 16 x i8> [[TMP2]])
+// CPP-CHECK-NEXT:    ret <vscale x 8 x i16> [[TMP3]]
+//
+svuint16_t test_svabal_u16(svuint16_t zda, svuint8_t zn, svuint8_t zm)  ATTR
+{
+  return SVE_ACLE_FUNC(svabal,,_u16)(zda, zn, zm);
+}
+
+// CHECK-LABEL: define dso_local <vscale x 8 x i16> @test_svabal_n_u16(
+// CHECK-SAME: <vscale x 8 x i16> [[ZDA:%.*]], <vscale x 16 x i8> [[ZN:%.*]], i8 noundef [[ZM:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT:  [[ENTRY:.*:]]
+// CHECK-NEXT:    [[ZDA_ADDR:%.*]] = alloca <vscale x 8 x i16>, align 16
+// CHECK-NEXT:    [[ZN_ADDR:%.*]] = alloca <vscale x 16 x i8>, align 16
+// CHECK-NEXT:    [[ZM_ADDR:%.*]] = alloca i8, align 1
+// CHECK-NEXT:    store <vscale x 8 x i16> [[ZDA]], ptr [[ZDA_ADDR]], align 16
+// CHECK-NEXT:    store <vscale x 16 x i8> [[ZN]], ptr [[ZN_ADDR]], align 16
+// CHECK-NEXT:    store i8 [[ZM]], ptr [[ZM_ADDR]], align 1
+// CHECK-NEXT:    [[TMP0:%.*]] = load <vscale x 8 x i16>, ptr [[ZDA_ADDR]], align 16
+// CHECK-NEXT:    [[TMP1:%.*]] = load <vscale x 16 x i8>, ptr [[ZN_ADDR]], align 16
+// CHECK-NEXT:    [[TMP2:%.*]] = load i8, ptr [[ZM_ADDR]], align 1
+// CHECK-NEXT:    [[DOTSPLATINSERT:%.*]] = insertelement <vscale x 16 x i8> poison, i8 [[TMP2]], i64 0
+// CHECK-NEXT:    [[DOTSPLAT:%.*]] = shufflevector <vscale x 16 x i8> [[DOTSPLATINSERT]], <vscale x 16 x i8> poison, <vscale x 16 x i32> zeroinitializer
+// CHECK-NEXT:    [[TMP3:%.*]] = call <vscale x 8 x i16> @llvm.aarch64.sve.uabal.nxv8i16(<vscale x 8 x i16> [[TMP0]], <vscale x 16 x i8> [[TMP1]], <vscale x 16 x i8> [[DOTSPLAT]])
+// CHECK-NEXT:    ret <vscale x 8 x i16> [[TMP3]]
+//
+// CPP-CHECK-LABEL: define dso_local <vscale x 8 x i16> @_Z17test_svabal_n_u16u12__SVUint16_tu11__SVUint8_th(
+// CPP-CHECK-SAME: <vscale x 8 x i16> [[ZDA:%.*]], <vscale x 16 x i8> [[ZN:%.*]], i8 noundef [[ZM:%.*]]) #[[ATTR0]] {
+// CPP-CHECK-NEXT:  [[ENTRY:.*:]]
+// CPP-CHECK-NEXT:    [[ZDA_ADDR:%.*]] = alloca <vscale x 8 x i16>, align 16
+// CPP-CHECK-NEXT:    [[ZN_ADDR:%.*]] = alloca <vscale x 16 x i8>, align 16
+// CPP-CHECK-NEXT:    [[ZM_ADDR:%.*]] = alloca i8, align 1
+// CPP-CHECK-NEXT:    store <vscale x 8 x i16> [[ZDA]], ptr [[ZDA_ADDR]], align 16
+// CPP-CHECK-NEXT:    store <vscale x 16 x i8> [[ZN]], ptr [[ZN_ADDR]], align 16
+// CPP-CHECK-NEXT:    store i8 [[ZM]], ptr [[ZM_ADDR]], align 1
+// CPP-CHECK-NEXT:    [[TMP0:%.*]] = load <vscale x 8 x i16>, ptr [[ZDA_ADDR]], align 16
+// CPP-CHECK-NEXT:    [[TMP1:%.*]] = load <vscale x 16 x i8>, ptr [[ZN_ADDR]], align 16
+// CPP-CHECK-NEXT:    [[TMP2:%.*]] = load i8, ptr [[ZM_ADDR]], align 1
+// CPP-CHECK-NEXT:    [[DOTSPLATINSERT:%.*]] = insertelement <vscale x 16 x i8> poison, i8 [[TMP2]], i64 0
+// CPP-CHECK-NEXT:    [[DOTSPLAT:%.*]] = shufflevector <vscale x 16 x i8> [[DOTSPLATINSERT]], <vscale x 16 x i8> poison, <vscale x 16 x i32> zeroinitializer
+// CPP-CHECK-NEXT:    [[TMP3:%.*]] = call <vscale x 8 x i16> @llvm.aarch64.sve.uabal.nxv8i16(<vscale x 8 x i16> [[TMP0]], <vscale x 16 x i8> [[TMP1]], <vscale x 16 x i8> [[DOTSPLAT]])
+// CPP-CHECK-NEXT:    ret <vscale x 8 x i16> [[TMP3]]
+//
+svuint16_t test_svabal_n_u16(svuint16_t zda, svuint8_t zn, uint8_t zm)  ATTR
+{
+  return SVE_ACLE_FUNC(svabal,_n,_u16)(zda, zn, zm);
+}
+
+// CHECK-LABEL: define dso_local <vscale x 4 x i32> @test_svabal_u32(
+// CHECK-SAME: <vscale x 4 x i32> [[ZDA:%.*]], <vscale x 8 x i16> [[ZN:%.*]], <vscale x 8 x i16> [[ZM:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT:  [[ENTRY:.*:]]
+// CHECK-NEXT:    [[ZDA_ADDR:%.*]] = alloca <vscale x 4 x i32>, align 16
+// CHECK-NEXT:    [[ZN_ADDR:%.*]] = alloca <vscale x 8 x i16>, align 16
+// CHECK-NEXT:    [[ZM_ADDR:%.*]] = alloca <vscale x 8 x i16>, align 16
+// CHECK-NEXT:    store <vscale x 4 x i32> [[ZDA]], ptr [[ZDA_ADDR]], align 16
+// CHECK-NEXT:    store <vscale x 8 x i16> [[ZN]], ptr [[ZN_ADDR]], align 16
+// CHECK-NEXT:    store <vscale x 8 x i16> [[ZM]], ptr [[ZM_ADDR]], align 16
+// CHECK-NEXT:    [[TMP0:%.*]] = load <vscale x 4 x i32>, ptr [[ZDA_ADDR]], align 16
+// CHECK-NEXT:    [[TMP1:%.*]] = load <vscale x 8 x i16>, ptr [[ZN_ADDR]], align 16
+// CHECK-NEXT:    [[TMP2:%.*]] = load <vscale x 8 x i16>, ptr [[ZM_ADDR]], align 16
+// CHECK-NEXT:    [[TMP3:%.*]] = call <vscale x 4 x i32> @llvm.aarch64.sve.uabal.nxv4i32(<vscale x 4 x i32> [[TMP0]], <vscale x 8 x i16> [[TMP1]], <vscale x 8 x i16> [[TMP2]])
+// CHECK-NEXT:    ret <vscale x 4 x i32> [[TMP3]]
+//
+// CPP-CHECK-LABEL: define dso_local <vscale x 4 x i32> @_Z15test_svabal_u32u12__SVUint32_tu12__SVUint16_tS0_(
+// CPP-CHECK-SAME: <vscale x 4 x i32> [[ZDA:%.*]], <vscale x 8 x i16> [[ZN:%.*]], <vscale x 8 x i16> [[ZM:%.*]]) #[[ATTR0]] {
+// CPP-CHECK-NEXT:  [[ENTRY:.*:]]
+// CPP-CHECK-NEXT:    [[ZDA_ADDR:%.*]] = alloca <vscale x 4 x i32>, align 16
+// CPP-CHECK-NEXT:    [[ZN_ADDR:%.*]] = alloca <vscale x 8 x i16>, align 16
+// CPP-CHECK-NEXT:    [[ZM_ADDR:%.*]] = alloca <vscale x 8 x i16>, align 16
+// CPP-CHECK-NEXT:    store <vscale x 4 x i32> [[ZDA]], ptr [[ZDA_ADDR]], align 16
+// CPP-CHECK-NEXT:    store <vscale x 8 x i16> [[ZN]], ptr [[ZN_ADDR]], align 16
+// CPP-CHECK-NEXT:    store <vscale x 8 x i16> [[ZM]], ptr [[ZM_ADDR]], align 16
+// CPP-CHECK-NEXT:    [[TMP0:%.*]] = load <vscale x 4 x i32>, ptr [[ZDA_ADDR]], align 16
+// CPP-CHECK-NEXT:    [[TMP1:%.*]] = load <vscale x 8 x i16>, ptr [[ZN_ADDR]], align 16
+// CPP-CHECK-NEXT:    [[TMP2:%.*]] = load <vscale x 8 x i16>, ptr [[ZM_ADDR]], align 16
+// CPP-CHECK-NEXT:    [[TMP3:%.*]] = call <vscale x 4 x i32> @llvm.aarch64.sve.uabal.nxv4i32(<vscale x 4 x i32> [[TMP0]], <vscale x 8 x i16> [[TMP1]], <vscale x 8 x i16> [[TMP2]])
+// CPP-CHECK-NEXT:    ret <vscale x 4 x i32> [[TMP3]]
+//
+svuint32_t test_svabal_u32(svuint32_t zda, svuint16_t zn, svuint16_t zm)  ATTR
+{
+  return SVE_ACLE_FUNC(svabal,,_u32)(zda, zn, zm);
+}
+
+// CHECK-LABEL: define dso_local <vscale x 4 x i32> @test_svabal_n_u32(
+// CHECK-SAME: <vscale x 4 x i32> [[ZDA:%.*]], <vscale x 8 x i16> [[ZN:%.*]], i16 noundef [[ZM:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT:  [[ENTRY:.*:]]
+// CHECK-NEXT:    [[ZDA_ADDR:%.*]] = alloca <vscale x 4 x i32>, align 16
+// CHECK-NEXT:    [[ZN_ADDR:%.*]] = alloca <vscale x 8 x i16>, align 16
+// CHECK-NEXT:    [[ZM_ADDR:%.*]] = alloca i16, align 2
+// CHECK-NEXT:    store <vscale x 4 x i32> [[ZDA]], ptr [[ZDA_ADDR]], align 16
+// CHECK-NEXT:    store <vscale x 8 x i16> [[ZN]], ptr [[ZN_ADDR]], align 16
+// CHECK-NEXT:    store i16 [[ZM]], ptr [[ZM_ADDR]], align 2
+// CHECK-NEXT:    [[TMP0:%.*]] = load <vscale x 4 x i32>, ptr [[ZDA_ADDR]], align 16
+// CHECK-NEXT:    [[TMP1:%.*]] = load <vscale x 8 x i16>, ptr [[ZN_ADDR]], align 16
+// CHECK-NEXT:    [[TMP2:%.*]] = load i16, ptr [[ZM_ADDR]], align 2
+// CHECK-NEXT:    [[DOTSPLATINSERT:%.*]] = insertelement <vscale x 8 x i16> poison, i16 [[TMP2]], i64 0
+// CHECK-NEXT:    [[DOTSPLAT:%.*]] = shufflevector <vscale x 8 x i16> [[DOTSPLATINSERT]], <vscale x 8 x i16> poison, <vscale x 8 x i32> zeroinitializer
+// CHECK-NEXT:    [[TMP3:%.*]] = call <vscale x 4 x i32> @llvm.aarch64.sve.uabal.nxv4i32(<vscale x 4 x i32> [[TMP0]], <vscale x 8 x i16> [[TMP1]], <vscale x 8 x i16> [[DOTSPLAT]])
+// CHECK-NEXT:    ret <vscale x 4 x i32> [[TMP3]]
+//
+// CPP-CHECK-LABEL: define dso_local <vscale x 4 x i32> @_Z17test_svabal_n_u32u12__SVUint32_tu12__SVUint16_tt(
+// CPP-CHECK-SAME: <vscale x 4 x i32> [[ZDA:%.*]], <vscale x 8 x i16> [[ZN:%.*]], i16 noundef [[ZM:%.*]]) #[[ATTR0]] {
+// CPP-CHECK-NEXT:  [[ENTRY:.*:]]
+// CPP-CHECK-NEXT:    [[ZDA_ADDR:%.*]] = alloca <vscale x 4 x i32>, align 16
+// CPP-CHECK-NEXT:    [[ZN_ADDR:%.*]] = alloca <vscale x 8 x i16>, align 16
+// CPP-CHECK-NEXT:    [[ZM_ADDR:%.*]] = alloca i16, align 2
+// CPP-CHECK-NEXT:    store <vscale x 4 x i32> [[ZDA]], ptr [[ZDA_ADDR]], align 16
+// CPP-CHECK-NEXT:    store <vscale x 8 x i16> [[ZN]], ptr [[ZN_ADDR]], align 16
+// CPP-CHECK-NEXT:    store i16 [[ZM]], ptr [[ZM_ADDR]], align 2
+// CPP-CHECK-NEXT:    [[TMP0:%.*]] = load <vscale x 4 x i32>, ptr [[ZDA_ADDR]], align 16
+// CPP-CHECK-NEXT:    [[TMP1:%.*]] = load <vscale x 8 x i16>, ptr [[ZN_ADDR]], align 16
+// CPP-CHECK-NEXT:    [[TMP2:%.*]] = load i16, ptr [[ZM_ADDR]], align 2
+// CPP-CHECK-NEXT:    [[DOTSPLATINSERT:%.*]] = insertelement <vscale x 8 x i16> poison, i16 [[TMP2]], i64 0
+// CPP-CHECK-NEXT:    [[DOTSPLAT:%.*]] = shufflevector <vscale x 8 x i16> [[DOTSPLATINSERT]], <vscale x 8 x i16> poison, <vscale x 8 x i32> zeroinitializer
+// CPP-CHECK-NEXT:    [[TMP3:%.*]] = call <vscale x 4 x i32> @llvm.aarch64.sve.uabal.nxv4i32(<vscale x 4 x i32> [[TMP0]], <vscale x 8 x i16> [[TMP1]], <vscale x 8 x i16> [[DOTSPLAT]])
+// CPP-CHECK-NEXT:    ret <vscale x 4 x i32> [[TMP3]]
+//
+svuint32_t test_svabal_n_u32(svuint32_t zda, svuint16_t zn, uint16_t zm)  ATTR
+{
+  return SVE_ACLE_FUNC(svabal,_n,_u32)(zda, zn, zm); // scalar zm; CHECK lines above expect it splatted to <vscale x 8 x i16> before the uabal.nxv4i32 call
+}
+
+// CHECK-LABEL: define dso_local <vscale x 2 x i64> @test_svabal_u64(
+// CHECK-SAME: <vscale x 2 x i64> [[ZDA:%.*]], <vscale x 4 x i32> [[ZN:%.*]], <vscale x 4 x i32> [[ZM:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT:  [[ENTRY:.*:]]
+// CHECK-NEXT:    [[ZDA_ADDR:%.*]] = alloca <vscale x 2 x i64>, align 16
+// CHECK-NEXT:    [[ZN_ADDR:%.*]] = alloca <vscale x 4 x i32>, align 16
+// CHECK-NEXT:    [[ZM_ADDR:%.*]] = alloca <vscale x 4 x i32>, align 16
+// CHECK-NEXT:    store <vscale x 2 x i64> [[ZDA]], ptr [[ZDA_ADDR]], align 16
+// CHECK-NEXT:    store <vscale x 4 x i32> [[ZN]], ptr [[ZN_ADDR]], align 16
+// CHECK-NEXT:    store <vscale x 4 x i32> [[ZM]], ptr [[ZM_ADDR]], align 16
+// CHECK-NEXT:    [[TMP0:%.*]] = load <vscale x 2 x i64>, ptr [[ZDA_ADDR]], align 16
+// CHECK-NEXT:    [[TMP1:%.*]] = load <vscale x 4 x i32>, ptr [[ZN_ADDR]], align 16
+// CHECK-NEXT:    [[TMP2:%.*]] = load <vscale x 4 x i32>, ptr [[ZM_ADDR]], align 16
+// CHECK-NEXT:    [[TMP3:%.*]] = call <vscale x 2 x i64> @llvm.aarch64.sve.uabal.nxv2i64(<vscale x 2 x i64> [[TMP0]], <vscale x 4 x i32> [[TMP1]], <vscale x 4 x i32> [[TMP2]])
+// CHECK-NEXT:    ret <vscale x 2 x i64> [[TMP3]]
+//
+// CPP-CHECK-LABEL: define dso_local <vscale x 2 x i64> @_Z15test_svabal_u64u12__SVUint64_tu12__SVUint32_tS0_(
+// CPP-CHECK-SAME: <vscale x 2 x i64> [[ZDA:%.*]], <vscale x 4 x i32> [[ZN:%.*]], <vscale x 4 x i32> [[ZM:%.*]]) #[[ATTR0]] {
+// CPP-CHECK-NEXT:  [[ENTRY:.*:]]
+// CPP-CHECK-NEXT:    [[ZDA_ADDR:%.*]] = alloca <vscale x 2 x i64>, align 16
+// CPP-CHECK-NEXT:    [[ZN_ADDR:%.*]] = alloca <vscale x 4 x i32>, align 16
+// CPP-CHECK-NEXT:    [[ZM_ADDR:%.*]] = alloca <vscale x 4 x i32>, align 16
+// CPP-CHECK-NEXT:    store <vscale x 2 x i64> [[ZDA]], ptr [[ZDA_ADDR]], align 16
+// CPP-CHECK-NEXT:    store <vscale x 4 x i32> [[ZN]], ptr [[ZN_ADDR]], align 16
+// CPP-CHECK-NEXT:    store <vscale x 4 x i32> [[ZM]], ptr [[ZM_ADDR]], align 16
+// CPP-CHECK-NEXT:    [[TMP0:%.*]] = load <vscale x 2 x i64>, ptr [[ZDA_ADDR]], align 16
+// CPP-CHECK-NEXT:    [[TMP1:%.*]] = load <vscale x 4 x i32>, ptr [[ZN_ADDR]], align 16
+// CPP-CHECK-NEXT:    [[TMP2:%.*]] = load <vscale x 4 x i32>, ptr [[ZM_ADDR]], align 16
+// CPP-CHECK-NEXT:    [[TMP3:%.*]] = call <vscale x 2 x i64> @llvm.aarch64.sve.uabal.nxv2i64(<vscale x 2 x i64> [[TMP0]], <vscale x 4 x i32> [[TMP1]], <vscale x 4 x i32> [[TMP2]])
+// CPP-CHECK-NEXT:    ret <vscale x 2 x i64> [[TMP3]]
+//
+svuint64_t test_svabal_u64(svuint64_t zda, svuint32_t zn, svuint32_t zm)  ATTR
+{
+  return SVE_ACLE_FUNC(svabal,,_u64)(zda, zn, zm); // vector form; CHECK lines above expect @llvm.aarch64.sve.uabal.nxv2i64
+}
+
+// CHECK-LABEL: define dso_local <vscale x 2 x i64> @test_svabal_n_u64(
+// CHECK-SAME: <vscale x 2 x i64> [[ZDA:%.*]], <vscale x 4 x i32> [[ZN:%.*]], i32 noundef [[ZM:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT:  [[ENTRY:.*:]]
+// CHECK-NEXT:    [[ZDA_ADDR:%.*]] = alloca <vscale x 2 x i64>, align 16
+// CHECK-NEXT:    [[ZN_ADDR:%.*]] = alloca <vscale x 4 x i32>, align 16
+// CHECK-NEXT:    [[ZM_ADDR:%.*]] = alloca i32, align 4
+// CHECK-NEXT:    store <vscale x 2 x i64> [[ZDA]], ptr [[ZDA_ADDR]], align 16
+// CHECK-NEXT:    store <vscale x 4 x i32> [[ZN]], ptr [[ZN_ADDR]], align 16
+// CHECK-NEXT:    store i32 [[ZM]], ptr [[ZM_ADDR]], align 4
+// CHECK-NEXT:    [[TMP0:%.*]] = load <vscale x 2 x i64>, ptr [[ZDA_ADDR]], align 16
+// CHECK-NEXT:    [[TMP1:%.*]] = load <vscale x 4 x i32>, ptr [[ZN_ADDR]], align 16
+// CHECK-NEXT:    [[TMP2:%.*]] = load i32, ptr [[ZM_ADDR]], align 4
+// CHECK-NEXT:    [[DOTSPLATINSERT:%.*]] = insertelement <vscale x 4 x i32> poison, i32 [[TMP2]], i64 0
+// CHECK-NEXT:    [[DOTSPLAT:%.*]] = shufflevector <vscale x 4 x i32> [[DOTSPLATINSERT]], <vscale x 4 x i32> poison, <vscale x 4 x i32> zeroinitializer
+// CHECK-NEXT:    [[TMP3:%.*]] = call <vscale x 2 x i64> @llvm.aarch64.sve.uabal.nxv2i64(<vscale x 2 x i64> [[TMP0]], <vscale x 4 x i32> [[TMP1]], <vscale x 4 x i32> [[DOTSPLAT]])
+// CHECK-NEXT:    ret <vscale x 2 x i64> [[TMP3]]
+//
+// CPP-CHECK-LABEL: define dso_local <vscale x 2 x i64> @_Z17test_svabal_n_u64u12__SVUint64_tu12__SVUint32_tj(
+// CPP-CHECK-SAME: <vscale x 2 x i64> [[ZDA:%.*]], <vscale x 4 x i32> [[ZN:%.*]], i32 noundef [[ZM:%.*]]) #[[ATTR0]] {
+// CPP-CHECK-NEXT:  [[ENTRY:.*:]]
+// CPP-CHECK-NEXT:    [[ZDA_ADDR:%.*]] = alloca <vscale x 2 x i64>, align 16
+// CPP-CHECK-NEXT:    [[ZN_ADDR:%.*]] = alloca <vscale x 4 x i32>, align 16
+// CPP-CHECK-NEXT:    [[ZM_ADDR:%.*]] = alloca i32, align 4
+// CPP-CHECK-NEXT:    store <vscale x 2 x i64> [[ZDA]], ptr [[ZDA_ADDR]], align 16
+// CPP-CHECK-NEXT:    store <vscale x 4 x i32> [[ZN]], ptr [[ZN_ADDR]], align 16
+// CPP-CHECK-NEXT:    store i32 [[ZM]], ptr [[ZM_ADDR]], align 4
+// CPP-CHECK-NEXT:    [[TMP0:%.*]] = load <vscale x 2 x i64>, ptr [[ZDA_ADDR]], align 16
+// CPP-CHECK-NEXT:    [[TMP1:%.*]] = load <vscale x 4 x i32>, ptr [[ZN_ADDR]], align 16
+// CPP-CHECK-NEXT:    [[TMP2:%.*]] = load i32, ptr [[ZM_ADDR]], align 4
+// CPP-CHECK-NEXT:    [[DOTSPLATINSERT:%.*]] = insertelement <vscale x 4 x i32> poison, i32 [[TMP2]], i64 0
+// CPP-CHECK-NEXT:    [[DOTSPLAT:%.*]] = shufflevector <vscale x 4 x i32> [[DOTSPLATINSERT]], <vscale x 4 x i32> poison, <vscale x 4 x i32> zeroinitializer
+// CPP-CHECK-NEXT:    [[TMP3:%.*]] = call <vscale x 2 x i64> @llvm.aarch64.sve.uabal.nxv2i64(<vscale x 2 x i64> [[TMP0]], <vscale x 4 x i32> [[TMP1]], <vscale x 4 x i32> [[DOTSPLAT]])
+// CPP-CHECK-NEXT:    ret <vscale x 2 x i64> [[TMP3]]
+//
+svuint64_t test_svabal_n_u64(svuint64_t zda, svuint32_t zn, uint32_t zm)  ATTR
+{
+  return SVE_ACLE_FUNC(svabal,_n,_u64)(zda, zn, zm); // scalar zm; CHECK lines above expect it splatted to <vscale x 4 x i32> before the uabal.nxv2i64 call
+}
diff --git a/clang/test/Sema/AArch64/arm_sve_feature_dependent_sve_AND_LP_sve2p3_OR_sme2p3_RP___sme_AND_LP_sve2p3_OR_sme2p3_RP.c b/clang/test/Sema/AArch64/arm_sve_feature_dependent_sve_AND_LP_sve2p3_OR_sme2p3_RP___sme_AND_LP_sve2p3_OR_sme2p3_RP.c
index 9aa4e34c9b8ac..00f6001bdb29e 100644
--- a/clang/test/Sema/AArch64/arm_sve_feature_dependent_sve_AND_LP_sve2p3_OR_sme2p3_RP___sme_AND_LP_sve2p3_OR_sme2p3_RP.c
+++ b/clang/test/Sema/AArch64/arm_sve_feature_dependent_sve_AND_LP_sve2p3_OR_sme2p3_RP___sme_AND_LP_sve2p3_OR_sme2p3_RP.c
@@ -10,6 +10,9 @@
 // Properties: guard="sve,(sve2p3|sme2p3)" streaming_guard="sme,(sve2p3|sme2p3)" flags="feature-dependent"
 
 void test(void) {
+  int8_t int8_t_val;
+  int16_t int16_t_val;
+  int32_t int32_t_val;
   svbool_t svbool_t_val;
   svint8_t svint8_t_val;
   svint16_t svint16_t_val;
@@ -19,7 +22,34 @@ void test(void) {
   svuint16_t svuint16_t_val;
   svuint32_t svuint32_t_val;
   svuint64_t svuint64_t_val;
+  uint8_t uint8_t_val;
+  uint16_t uint16_t_val;
+  uint32_t uint32_t_val;
 
+  svabal(svint16_t_val, svint8_t_val, int8_t_val);
+  svabal(svint16_t_val, svint8_t_val, svint8_t_val);
+  svabal(svint32_t_val, svint16_t_val, int16_t_val);
+  svabal(svint32_t_val, svint16_t_val, svint16_t_val);
+  svabal(svint64_t_val, svint32_t_val, int32_t_val);
+  svabal(svint64_t_val, svint32_t_val, svint32_t_val);
+  svabal(svuint16_t_val, svuint8_t_val, svuint8_t_val);
+  svabal(svuint16_t_val, svuint8_t_val, uint8_t_val);
+  svabal(svuint32_t_val, svuint16_t_val, svuint16_t_val);
+  svabal(svuint32_t_val, svuint16_t_val, uint16_t_val);
+  svabal(svuint64_t_val, svuint32_t_val, svuint32_t_val);
+  svabal(svuint64_t_val, svuint32_t_val, uint32_t_val);
+  svabal_n_s16(svint16_t_val, svint8_t_val, int8_t_val);
+  svabal_n_s32(svint32_t_val, svint16_t_val, int16_t_val);
+  svabal_n_s64(svint64_t_val, svint32_t_val, int32_t_val);
+  svabal_n_u16(svuint16_t_val, svuint8_t_val, uint8_t_val);
+  svabal_n_u32(svuint32_t_val, svuint16_t_val, uint16_t_val);
+  svabal_n_u64(svuint64_t_val, svuint32_t_val, uint32_t_val);
+  svabal_s16(svint16_t_val, svint8_t_val, svint8_t_val);
+  svabal_s32(svint32_t_val, svint16_t_val, svint16_t_val);
+  svabal_s64(svint64_t_val, svint32_t_val, svint32_t_val);
+  svabal_u16(svuint16_t_val, svuint8_t_val, svuint8_t_val);
+  svabal_u32(svuint32_t_val, svuint16_t_val, svuint16_t_val);
+  svabal_u64(svuint64_t_val, svuint32_t_val, svuint32_t_val);
   svaddqp(svint8_t_val, svint8_t_val);
   svaddqp(svint16_t_val, svint16_t_val);
   svaddqp(svint32_t_val, svint32_t_val);
@@ -62,20 +92,28 @@ void test(void) {
   svsubp_m(svbool_t_val, svuint64_t_val, svuint64_t_val);
   svsubp_s8_m(svbool_t_val, svint8_t_val, svint8_t_val);
   svsubp_s8_x(svbool_t_val, svint8_t_val, svint8_t_val);
+  svsubp_s8_z(svbool_t_val, svint8_t_val, svint8_t_val);
   svsubp_s16_m(svbool_t_val, svint16_t_val, svint16_t_val);
   svsubp_s16_x(svbool_t_val, svint16_t_val, svint16_t_val);
+  svsubp_s16_z(svbool_t_val, svint16_t_val, svint16_t_val);
   svsubp_s32_m(svbool_t_val, svint32_t_val, svint32_t_val);
   svsubp_s32_x(svbool_t_val, svint32_t_val, svint32_t_val);
+  svsubp_s32_z(svbool_t_val, svint32_t_val, svint32_t_val);
   svsubp_s64_m(svbool_t_val, svint64_t_val, svint64_t_val);
   svsubp_s64_x(svbool_t_val, svint64_t_val, svint64_t_val);
+  svsubp_s64_z(svbool_t_val, svint64_t_val, svint64_t_val);
   svsubp_u8_m(svbool_t_val, svuint8_t_val, svuint8_t_val);
   svsubp_u8_x(svbool_t_val, svuint8_t_val, svuint8_t_val);
+  svsubp_u8_z(svbool_t_val, svuint8_t_val, svuint8_t_val);
   svsubp_u16_m(svbool_t_val, svuint16_t_val, svuint16_t_val);
   svsubp_u16_x(svbool_t_val, svuint16_t_val, svuint16_t_val);
+  svsubp_u16_z(svbool_t_val, svuint16_t_val, svuint16_t_val);
   svsubp_u32_m(svbool_t_val, svuint32_t_val, svuint32_t_val);
   svsubp_u32_x(svbool_t_val, svuint32_t_val, svuint32_t_val);
+  svsubp_u32_z(svbool_t_val, svuint32_t_val, svuint32_t_val);
   svsubp_u64_m(svbool_t_val, svuint64_t_val, svuint64_t_val);
   svsubp_u64_x(svbool_t_val, svuint64_t_val, svuint64_t_val);
+  svsubp_u64_z(svbool_t_val, svuint64_t_val, svuint64_t_val);
   svsubp_x(svbool_t_val, svint8_t_val, svint8_t_val);
   svsubp_x(svbool_t_val, svint16_t_val, svint16_t_val);
   svsubp_x(svbool_t_val, svint32_t_val, svint32_t_val);
@@ -84,9 +122,20 @@ void test(void) {
   svsubp_x(svbool_t_val, svuint16_t_val, svuint16_t_val);
   svsubp_x(svbool_t_val, svuint32_t_val, svuint32_t_val);
   svsubp_x(svbool_t_val, svuint64_t_val, svuint64_t_val);
+  svsubp_z(svbool_t_val, svint8_t_val, svint8_t_val);
+  svsubp_z(svbool_t_val, svint16_t_val, svint16_t_val);
+  svsubp_z(svbool_t_val, svint32_t_val, svint32_t_val);
+  svsubp_z(svbool_t_val, svint64_t_val, svint64_t_val);
+  svsubp_z(svbool_t_val, svuint8_t_val, svuint8_t_val);
+  svsubp_z(svbool_t_val, svuint16_t_val, svuint16_t_val);
+  svsubp_z(svbool_t_val, svuint32_t_val, svuint32_t_val);
+  svsubp_z(svbool_t_val, svuint64_t_val, svuint64_t_val);
 }
 
 void test_streaming(void) __arm_streaming{
+  int8_t int8_t_val;
+  int16_t int16_t_val;
+  int32_t int32_t_val;
   svbool_t svbool_t_val;
   svint8_t svint8_t_val;
   svint16_t svint16_t_val;
@@ -96,7 +145,34 @@ void test_streaming(void) __arm_streaming{
   svuint16_t svuint16_t_val;
   svuint32_t svuint32_t_val;
   svuint64_t svuint64_t_val;
+  uint8_t uint8_t_val;
+  uint16_t uint16_t_val;
+  uint32_t uint32_t_val;
 
+  svabal(svint16_t_val, svint8_t_val, int8_t_val);
+  svabal(svint16_t_val, svint8_t_val, svint8_t_val);
+  svabal(svint32_t_val, svint16_t_val, int16_t_val);
+  svabal(svint32_t_val, svint16_t_val, svint16_t_val);
+  svabal(svint64_t_val, svint32_t_val, int32_t_val);
+  svabal(svint64_t_val, svint32_t_val, svint32_t_val);
+  svabal(svuint16_t_val, svuint8_t_val, svuint8_t_val);
+  svabal(svuint16_t_val, svuint8_t_val, uint8_t_val);
+  svabal(svuint32_t_val, svuint16_t_val, svuint16_t_val);
+  svabal(svuint32_t_val, svuint16_t_val, uint16_t_val);
+  svabal(svuint64_t_val, svuint32_t_val, svuint32_t_val);
+  svabal(svuint64_t_val, svuint32_t_val, uint32_t_val);
+  svabal_n_s16(svint16_t_val, svint8_t_val, int8_t_val);
+  svabal_n_s32(svint32_t_val, svint16_t_val, int16_t_val);
+  svabal_n_s64(svint64_t_val, svint32_t_val, int32_t_val);
+  svabal_n_u16(svuint16_t_val, svuint8_t_val, uint8_t_val);
+  svabal_n_u32(svuint32_t_val, svuint16_t_val, uint16_t_val);
+  svabal_n_u64(svuint64_t_val, svuint32_t_val, uint32_t_val);
+  svabal_s16(svint16_t_val, svint8_t_val, svint8_t_val);
+  svabal_s32(svint32_t_val, svint16_t_val, svint16_t_val);
+  svabal_s64(svint64_t_val, svint32_t_val, svint32_t_val);
+  svabal_u16(svuint16_t_val, svuint8_t_val, svuint8_t_val);
+  svabal_u32(svuint32_t_val, svuint16_t_val, svuint16_t_val);
+  svabal_u64(svuint64_t_val, svuint32_t_val, svuint32_t_val);
   svaddqp(svint8_t_val, svint8_t_val);
   svaddqp(svint16_t_val, svint16_t_val);
   svaddqp(svint32_t_val, svint32_t_val);
@@ -139,20 +215,28 @@ void test_streaming(void) __arm_streaming{
   svsubp_m(svbool_t_val, svuint64_t_val, svuint64_t_val);
   svsubp_s8_m(svbool_t_val, svint8_t_val, svint8_t_val);
   svsubp_s8_x(svbool_t_val, svint8_t_val, svint8_t_val);
+  svsubp_s8_z(svbool_t_val, svint8_t_val, svint8_t_val);
   svsubp_s16_m(svbool_t_val, svint16_t_val, svint16_t_val);
   svsubp_s16_x(svbool_t_val, svint16_t_val, svint16_t_val);
+  svsubp_s16_z(svbool_t_val, svint16_t_val, svint16_t_val);
   svsubp_s32_m(svbool_t_val, svint32_t_val, svint32_t_val);
   svsubp_s32_x(svbool_t_val, svint32_t_val, svint32_t_val);
+  svsubp_s32_z(svbool_t_val, svint32_t_val, svint32_t_val);
   svsubp_s64_m(svbool_t_val, svint64_t_val, svint64_t_val);
   svsubp_s64_x(svbool_t_val, svint64_t_val, svint64_t_val);
+  svsubp_s64_z(svbool_t_val, svint64_t_val, svint64_t_val);
   svsubp_u8_m(svbool_t_val, svuint8_t_val, svuint8_t_val);
   svsubp_u8_x(svbool_t_val, svuint8_t_val, svuint8_t_val);
+  svsubp_u8_z(svbool_t_val, svuint8_t_val, svuint8_t_val);
   svsubp_u16_m(svbool_t_val, svuint16_t_val, svuint16_t_val);
   svsubp_u16_x(svbool_t_val, svuint16_t_val, svuint16_t_val);
+  svsubp_u16_z(svbool_t_val, svuint16_t_val, svuint16_t_val);
   svsubp_u32_m(svbool_t_val, svuint32_t_val, svuint32_t_val);
   svsubp_u32_x(svbool_t_val, svuint32_t_val, svuint32_t_val);
+  svsubp_u32_z(svbool_t_val, svuint32_t_val, svuint32_t_val);
   svsubp_u64_m(svbool_t_val, svuint64_t_val, svuint64_t_val);
   svsubp_u64_x(svbool_t_val, svuint64_t_val, svuint64_t_val);
+  svsubp_u64_z(svbool_t_val, svuint64_t_val, svuint64_t_val);
   svsubp_x(svbool_t_val, svint8_t_val, svint8_t_val);
   svsubp_x(svbool_t_val, svint16_t_val, svint16_t_val);
   svsubp_x(svbool_t_val, svint32_t_val, svint32_t_val);
@@ -161,9 +245,20 @@ void test_streaming(void) __arm_streaming{
   svsubp_x(svbool_t_val, svuint16_t_val, svuint16_t_val);
   svsubp_x(svbool_t_val, svuint32_t_val, svuint32_t_val);
   svsubp_x(svbool_t_val, svuint64_t_val, svuint64_t_val);
+  svsubp_z(svbool_t_val, svint8_t_val, svint8_t_val);
+  svsubp_z(svbool_t_val, svint16_t_val, svint16_t_val);
+  svsubp_z(svbool_t_val, svint32_t_val, svint32_t_val);
+  svsubp_z(svbool_t_val, svint64_t_val, svint64_t_val);
+  svsubp_z(svbool_t_val, svuint8_t_val, svuint8_t_val);
+  svsubp_z(svbool_t_val, svuint16_t_val, svuint16_t_val);
+  svsubp_z(svbool_t_val, svuint32_t_val, svuint32_t_val);
+  svsubp_z(svbool_t_val, svuint64_t_val, svuint64_t_val);
 }
 
 void test_streaming_compatible(void) __arm_streaming_compatible{
+  int8_t int8_t_val;
+  int16_t int16_t_val;
+  int32_t int32_t_val;
   svbool_t svbool_t_val;
   svint8_t svint8_t_val;
   svint16_t svint16_t_val;
@@ -173,7 +268,34 @@ void test_streaming_compatible(void) __arm_streaming_compatible{
   svuint16_t svuint16_t_val;
   svuint32_t svuint32_t_val;
   svuint64_t svuint64_t_val;
+  uint8_t uint8_t_val;
+  uint16_t uint16_t_val;
+  uint32_t uint32_t_val;
 
+  svabal(svint16_t_val, svint8_t_val, int8_t_val);
+  svabal(svint16_t_val, svint8_t_val, svint8_t_val);
+  svabal(svint32_t_val, svint16_t_val, int16_t_val);
+  svabal(svint32_t_val, svint16_t_val, svint16_t_val);
+  svabal(svint64_t_val, svint32_t_val, int32_t_val);
+  svabal(svint64_t_val, svint32_t_val, svint32_t_val);
+  svabal(svuint16_t_val, svuint8_t_val, svuint8_t_val);
+  svabal(svuint16_t_val, svuint8_t_val, uint8_t_val);
+  svabal(svuint32_t_val, svuint16_t_val, svuint16_t_val);
+  svabal(svuint32_t_val, svuint16_t_val, uint16_t_val);
+  svabal(svuint64_t_val, svuint32_t_val, svuint32_t_val);
+  svabal(svuint64_t_val, svuint32_t_val, uint32_t_val);
+  svabal_n_s16(svint16_t_val, svint8_t_val, int8_t_val);
+  svabal_n_s32(svint32_t_val, svint16_t_val, int16_t_val);
+  svabal_n_s64(svint64_t_val, svint32_t_val, int32_t_val);
+  svabal_n_u16(svuint16_t_val, svuint8_t_val, uint8_t_val);
+  svabal_n_u32(svuint32_t_val, svuint16_t_val, uint16_t_val);
+  svabal_n_u64(svuint64_t_val, svuint32_t_val, uint32_t_val);
+  svabal_s16(svint16_t_val, svint8_t_val, svint8_t_val);
+  svabal_s32(svint32_t_val, svint16_t_val, svint16_t_val);
+  svabal_s64(svint64_t_val, svint32_t_val, svint32_t_val);
+  svabal_u16(svuint16_t_val, svuint8_t_val, svuint8_t_val);
+  svabal_u32(svuint32_t_val, svuint16_t_val, svuint16_t_val);
+  svabal_u64(svuint64_t_val, svuint32_t_val, svuint32_t_val);
   svaddqp(svint8_t_val, svint8_t_val);
   svaddqp(svint16_t_val, svint16_t_val);
   svaddqp(svint32_t_val, svint32_t_val);
@@ -216,20 +338,28 @@ void test_streaming_compatible(void) __arm_streaming_compatible{
   svsubp_m(svbool_t_val, svuint64_t_val, svuint64_t_val);
   svsubp_s8_m(svbool_t_val, svint8_t_val, svint8_t_val);
   svsubp_s8_x(svbool_t_val, svint8_t_val, svint8_t_val);
+  svsubp_s8_z(svbool_t_val, svint8_t_val, svint8_t_val);
   svsubp_s16_m(svbool_t_val, svint16_t_val, svint16_t_val);
   svsubp_s16_x(svbool_t_val, svint16_t_val, svint16_t_val);
+  svsubp_s16_z(svbool_t_val, svint16_t_val, svint16_t_val);
   svsubp_s32_m(svbool_t_val, svint32_t_val, svint32_t_val);
   svsubp_s32_x(svbool_t_val, svint32_t_val, svint32_t_val);
+  svsubp_s32_z(svbool_t_val, svint32_t_val, svint32_t_val);
   svsubp_s64_m(svbool_t_val, svint64_t_val, svint64_t_val);
   svsubp_s64_x(svbool_t_val, svint64_t_val, svint64_t_val);
+  svsubp_s64_z(svbool_t_val, svint64_t_val, svint64_t_val);
   svsubp_u8_m(svbool_t_val, svuint8_t_val, svuint8_t_val);
   svsubp_u8_x(svbool_t_val, svuint8_t_val, svuint8_t_val);
+  svsubp_u8_z(svbool_t_val, svuint8_t_val, svuint8_t_val);
   svsubp_u16_m(svbool_t_val, svuint16_t_val, svuint16_t_val);
   svsubp_u16_x(svbool_t_val, svuint16_t_val, svuint16_t_val);
+  svsubp_u16_z(svbool_t_val, svuint16_t_val, svuint16_t_val);
   svsubp_u32_m(svbool_t_val, svuint32_t_val, svuint32_t_val);
   svsubp_u32_x(svbool_t_val, svuint32_t_val, svuint32_t_val);
+  svsubp_u32_z(svbool_t_val, svuint32_t_val, svuint32_t_val);
   svsubp_u64_m(svbool_t_val, svuint64_t_val, svuint64_t_val);
   svsubp_u64_x(svbool_t_val, svuint64_t_val, svuint64_t_val);
+  svsubp_u64_z(svbool_t_val, svuint64_t_val, svuint64_t_val);
   svsubp_x(svbool_t_val, svint8_t_val, svint8_t_val);
   svsubp_x(svbool_t_val, svint16_t_val, svint16_t_val);
   svsubp_x(svbool_t_val, svint32_t_val, svint32_t_val);
@@ -238,4 +368,12 @@ void test_streaming_compatible(void) __arm_streaming_compatible{
   svsubp_x(svbool_t_val, svuint16_t_val, svuint16_t_val);
   svsubp_x(svbool_t_val, svuint32_t_val, svuint32_t_val);
   svsubp_x(svbool_t_val, svuint64_t_val, svuint64_t_val);
+  svsubp_z(svbool_t_val, svint8_t_val, svint8_t_val);
+  svsubp_z(svbool_t_val, svint16_t_val, svint16_t_val);
+  svsubp_z(svbool_t_val, svint32_t_val, svint32_t_val);
+  svsubp_z(svbool_t_val, svint64_t_val, svint64_t_val);
+  svsubp_z(svbool_t_val, svuint8_t_val, svuint8_t_val);
+  svsubp_z(svbool_t_val, svuint16_t_val, svuint16_t_val);
+  svsubp_z(svbool_t_val, svuint32_t_val, svuint32_t_val);
+  svsubp_z(svbool_t_val, svuint64_t_val, svuint64_t_val);
 }
diff --git a/clang/test/Sema/aarch64-sve2p3-intrinsics/acle_sve2p3.cpp b/clang/test/Sema/aarch64-sve2p3-intrinsics/acle_sve2p3.cpp
new file mode 100644
index 0000000000000..3a233aaddff31
--- /dev/null
+++ b/clang/test/Sema/aarch64-sve2p3-intrinsics/acle_sve2p3.cpp
@@ -0,0 +1,63 @@
+// REQUIRES: aarch64-registered-target
+
+// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sve -verify -verify-ignore-unexpected=error,note -emit-llvm -o - %s
+// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sve -verify=overload -verify-ignore-unexpected=error,note -emit-llvm -o - %s
+#include <arm_sve.h>
+
+#ifdef SVE_OVERLOADED_FORMS
+// A simple used,unused... macro, long enough to represent any SVE builtin.
+#define SVE_ACLE_FUNC(A1,A2_UNUSED,A3_UNUSED) A1
+#else
+#define SVE_ACLE_FUNC(A1,A2,A3) A1##A2##A3
+#endif
+
+void test_svabal(int8_t s8, int16_t s16, int32_t s32, uint8_t u8, uint16_t u16, uint32_t u32)
+{
+  // expected-error@+2 {{'svabal_s64' needs target feature (sve,(sve2p3|sme2p3))|(sme,(sve2p3|sme2p3))}}
+  // overload-error@+1 {{'svabal' needs target feature (sve,(sve2p3|sme2p3))|(sme,(sve2p3|sme2p3))}}
+  SVE_ACLE_FUNC(svabal,,_s64)(svundef_s64(), svundef_s32(), svundef_s32());
+
+  // expected-error@+2 {{'svabal_n_s64' needs target feature (sve,(sve2p3|sme2p3))|(sme,(sve2p3|sme2p3))}}
+  // overload-error@+1 {{'svabal' needs target feature (sve,(sve2p3|sme2p3))|(sme,(sve2p3|sme2p3))}}
+  SVE_ACLE_FUNC(svabal,_n,_s64)(svundef_s64(), svundef_s32(), s32);
+
+  // expected-error@+2 {{'svabal_s32' needs target feature (sve,(sve2p3|sme2p3))|(sme,(sve2p3|sme2p3))}}
+  // overload-error@+1 {{'svabal' needs target feature (sve,(sve2p3|sme2p3))|(sme,(sve2p3|sme2p3))}}
+  SVE_ACLE_FUNC(svabal,,_s32)(svundef_s32(), svundef_s16(), svundef_s16());
+
+  // expected-error@+2 {{'svabal_n_s32' needs target feature (sve,(sve2p3|sme2p3))|(sme,(sve2p3|sme2p3))}}
+  // overload-error@+1 {{'svabal' needs target feature (sve,(sve2p3|sme2p3))|(sme,(sve2p3|sme2p3))}}
+  SVE_ACLE_FUNC(svabal,_n,_s32)(svundef_s32(), svundef_s16(), s16);
+
+  // expected-error@+2 {{'svabal_s16' needs target feature (sve,(sve2p3|sme2p3))|(sme,(sve2p3|sme2p3))}}
+  // overload-error@+1 {{'svabal' needs target feature (sve,(sve2p3|sme2p3))|(sme,(sve2p3|sme2p3))}}
+  SVE_ACLE_FUNC(svabal,,_s16)(svundef_s16(), svundef_s8(), svundef_s8());
+
+  // expected-error@+2 {{'svabal_n_s16' needs target feature (sve,(sve2p3|sme2p3))|(sme,(sve2p3|sme2p3))}}
+  // overload-error@+1 {{'svabal' needs target feature (sve,(sve2p3|sme2p3))|(sme,(sve2p3|sme2p3))}}
+  SVE_ACLE_FUNC(svabal,_n,_s16)(svundef_s16(), svundef_s8(), s8);
+
+  // expected-error@+2 {{'svabal_u64' needs target feature (sve,(sve2p3|sme2p3))|(sme,(sve2p3|sme2p3))}}
+  // overload-error@+1 {{'svabal' needs target feature (sve,(sve2p3|sme2p3))|(sme,(sve2p3|sme2p3))}}
+  SVE_ACLE_FUNC(svabal,,_u64)(svundef_u64(), svundef_u32(), svundef_u32());
+
+  // expected-error@+2 {{'svabal_n_u64' needs target feature (sve,(sve2p3|sme2p3))|(sme,(sve2p3|sme2p3))}}
+  // overload-error@+1 {{'svabal' needs target feature (sve,(sve2p3|sme2p3))|(sme,(sve2p3|sme2p3))}}
+  SVE_ACLE_FUNC(svabal,_n,_u64)(svundef_u64(), svundef_u32(), u32);
+
+  // expected-error@+2 {{'svabal_u32' needs target feature (sve,(sve2p3|sme2p3))|(sme,(sve2p3|sme2p3))}}
+  // overload-error@+1 {{'svabal' needs target feature (sve,(sve2p3|sme2p3))|(sme,(sve2p3|sme2p3))}}
+  SVE_ACLE_FUNC(svabal,,_u32)(svundef_u32(), svundef_u16(), svundef_u16());
+
+  // expected-error@+2 {{'svabal_n_u32' needs target feature (sve,(sve2p3|sme2p3))|(sme,(sve2p3|sme2p3))}}
+  // overload-error@+1 {{'svabal' needs target feature (sve,(sve2p3|sme2p3))|(sme,(sve2p3|sme2p3))}}
+  SVE_ACLE_FUNC(svabal,_n,_u32)(svundef_u32(), svundef_u16(), u16);
+
+  // expected-error@+2 {{'svabal_u16' needs target feature (sve,(sve2p3|sme2p3))|(sme,(sve2p3|sme2p3))}}
+  // overload-error@+1 {{'svabal' needs target feature (sve,(sve2p3|sme2p3))|(sme,(sve2p3|sme2p3))}}
+  SVE_ACLE_FUNC(svabal,,_u16)(svundef_u16(), svundef_u8(), svundef_u8());
+
+  // expected-error@+2 {{'svabal_n_u16' needs target feature (sve,(sve2p3|sme2p3))|(sme,(sve2p3|sme2p3))}}
+  // overload-error@+1 {{'svabal' needs target feature (sve,(sve2p3|sme2p3))|(sme,(sve2p3|sme2p3))}}
+  SVE_ACLE_FUNC(svabal,_n,_u16)(svundef_u16(), svundef_u8(), u8);
+}
diff --git a/llvm/include/llvm/IR/IntrinsicsAArch64.td b/llvm/include/llvm/IR/IntrinsicsAArch64.td
index 8db96c1f731b9..2204c05860693 100644
--- a/llvm/include/llvm/IR/IntrinsicsAArch64.td
+++ b/llvm/include/llvm/IR/IntrinsicsAArch64.td
@@ -2458,6 +2458,8 @@ def int_aarch64_sve_stnt1_scatter_scalar_offset  : AdvSIMD_ScatterStore_VS_Intri
 //
 
 def int_aarch64_sve_saba          : AdvSIMD_3VectorArg_Intrinsic<[IntrSpeculatable]>;
+def int_aarch64_sve_sabal         : SVE2_3VectorArg_Long_Intrinsic<[IntrSpeculatable]>;
+def int_aarch64_sve_uabal         : SVE2_3VectorArg_Long_Intrinsic<[IntrSpeculatable]>;
 def int_aarch64_sve_shadd         : AdvSIMD_Pred2VectorArg_Intrinsic<[IntrSpeculatable]>;
 def int_aarch64_sve_shsub         : AdvSIMD_Pred2VectorArg_Intrinsic<[IntrSpeculatable]>;
 def int_aarch64_sve_shsub_u       : AdvSIMD_Pred2VectorArg_Intrinsic<[IntrSpeculatable]>;
diff --git a/llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td b/llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td
index 4bdc1d1c00a26..c0651c80b63ab 100644
--- a/llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td
+++ b/llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td
@@ -4841,8 +4841,8 @@ let Predicates = [HasSVE2p3_or_SME2p3] in {
   defm SUBP_ZPmZZ    : sve2_int_arith_pred<0b100001, "subp", int_aarch64_sve_subp>;
 
   // SVE2 integer absolute difference and accumulate long
-  defm SABAL_ZZZ : sve2_int_two_way_absdiff_accum_long<0b0, "sabal">;
-  defm UABAL_ZZZ : sve2_int_two_way_absdiff_accum_long<0b1, "uabal">;
+  defm SABAL_ZZZ : sve2_int_two_way_absdiff_accum_long<0b0, "sabal", int_aarch64_sve_sabal>;
+  defm UABAL_ZZZ : sve2_int_two_way_absdiff_accum_long<0b1, "uabal", int_aarch64_sve_uabal>;
 
   // SVE2 integer dot product
   def SDOT_ZZZ_BtoH : sve_intx_dot<0b01, 0b00000, 0b0, "sdot", ZPR16, ZPR8>;
diff --git a/llvm/lib/Target/AArch64/SVEInstrFormats.td b/llvm/lib/Target/AArch64/SVEInstrFormats.td
index 8f5d3556cde17..0e6eb52a7a6c8 100644
--- a/llvm/lib/Target/AArch64/SVEInstrFormats.td
+++ b/llvm/lib/Target/AArch64/SVEInstrFormats.td
@@ -4695,10 +4695,15 @@ multiclass sve2_int_absdiff_accum_long<bits<2> opc, string asm,
   def : SVE_3_Op_Pat<nxv2i64, op, nxv2i64, nxv4i32, nxv4i32, !cast<Instruction>(NAME # _D)>;
 }
 
-multiclass sve2_int_two_way_absdiff_accum_long<bit U, string asm> {
+multiclass sve2_int_two_way_absdiff_accum_long<bit U, string asm,
+                                               SDPatternOperator op> {
   def _BtoH : sve2_int_absdiff_accum<0b001, { 0b01, U, 0b1 }, asm, ZPR16, ZPR8>;
   def _HtoS : sve2_int_absdiff_accum<0b010, { 0b01, U, 0b1 }, asm, ZPR32, ZPR16>;
   def _StoD : sve2_int_absdiff_accum<0b011, { 0b01, U, 0b1 }, asm, ZPR64, ZPR32>;
+  // Selection patterns: map `op` onto the _BtoH, _HtoS and _StoD instructions defined above.
+  def : SVE_3_Op_Pat<nxv8i16, op, nxv8i16, nxv16i8, nxv16i8, !cast<Instruction>(NAME # _BtoH)>;
+  def : SVE_3_Op_Pat<nxv4i32, op, nxv4i32, nxv8i16, nxv8i16, !cast<Instruction>(NAME # _HtoS)>;
+  def : SVE_3_Op_Pat<nxv2i64, op, nxv2i64, nxv4i32, nxv4i32, !cast<Instruction>(NAME # _StoD)>;
 }
 
 multiclass sve2_int_addsub_long_carry<bits<2> opc, string asm,
diff --git a/llvm/test/CodeGen/AArch64/sve2p3-intrinsics/sve2p3-intrinsics-abal.ll b/llvm/test/CodeGen/AArch64/sve2p3-intrinsics/sve2p3-intrinsics-abal.ll
new file mode 100644
index 0000000000000..6988a8ce67055
--- /dev/null
+++ b/llvm/test/CodeGen/AArch64/sve2p3-intrinsics/sve2p3-intrinsics-abal.ll
@@ -0,0 +1,58 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 6
+; RUN: llc -mtriple=aarch64-linux-gnu -mattr=+sve2p3 -enable-subreg-liveness -verify-machineinstrs < %s | FileCheck %s
+; RUN: llc -mtriple=aarch64-linux-gnu -mattr=+sme2p3 -enable-subreg-liveness -force-streaming -verify-machineinstrs < %s | FileCheck %s
+; RUN: llc -mtriple=aarch64-linux-gnu -mattr=+sme,+sve2p3 -enable-subreg-liveness -force-streaming -verify-machineinstrs < %s | FileCheck %s
+
+define <vscale x 8 x i16> @test_svabal_s16(<vscale x 8 x i16> %zda, <vscale x 16 x i8> %zn, <vscale x 16 x i8> %zm) {
+; CHECK-LABEL: test_svabal_s16:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    sabal z0.h, z1.b, z2.b
+; CHECK-NEXT:    ret
+  %res = call <vscale x 8 x i16> @llvm.aarch64.sve.sabal.nxv8i16(<vscale x 8 x i16> %zda, <vscale x 16 x i8> %zn, <vscale x 16 x i8> %zm)
+  ret <vscale x 8 x i16> %res
+}
+
+define <vscale x 4 x i32> @test_svabal_s32(<vscale x 4 x i32> %zda, <vscale x 8 x i16> %zn, <vscale x 8 x i16> %zm) {
+; CHECK-LABEL: test_svabal_s32:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    sabal z0.s, z1.h, z2.h
+; CHECK-NEXT:    ret
+  %res = call <vscale x 4 x i32> @llvm.aarch64.sve.sabal.nxv4i32(<vscale x 4 x i32> %zda, <vscale x 8 x i16> %zn, <vscale x 8 x i16> %zm)
+  ret <vscale x 4 x i32> %res
+}
+
+define <vscale x 2 x i64> @test_svabal_s64(<vscale x 2 x i64> %zda, <vscale x 4 x i32> %zn, <vscale x 4 x i32> %zm) {
+; CHECK-LABEL: test_svabal_s64:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    sabal z0.d, z1.s, z2.s
+; CHECK-NEXT:    ret
+  %res = call <vscale x 2 x i64> @llvm.aarch64.sve.sabal.nxv2i64(<vscale x 2 x i64> %zda, <vscale x 4 x i32> %zn, <vscale x 4 x i32> %zm)
+  ret <vscale x 2 x i64> %res
+}
+
+define <vscale x 8 x i16> @test_svabal_u16(<vscale x 8 x i16> %zda, <vscale x 16 x i8> %zn, <vscale x 16 x i8> %zm) {
+; CHECK-LABEL: test_svabal_u16:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    uabal z0.h, z1.b, z2.b
+; CHECK-NEXT:    ret
+  %res = call <vscale x 8 x i16> @llvm.aarch64.sve.uabal.nxv8i16(<vscale x 8 x i16> %zda, <vscale x 16 x i8> %zn, <vscale x 16 x i8> %zm)
+  ret <vscale x 8 x i16> %res
+}
+
+define <vscale x 4 x i32> @test_svabal_u32(<vscale x 4 x i32> %zda, <vscale x 8 x i16> %zn, <vscale x 8 x i16> %zm) {
+; CHECK-LABEL: test_svabal_u32:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    uabal z0.s, z1.h, z2.h
+; CHECK-NEXT:    ret
+  %res = call <vscale x 4 x i32> @llvm.aarch64.sve.uabal.nxv4i32(<vscale x 4 x i32> %zda, <vscale x 8 x i16> %zn, <vscale x 8 x i16> %zm)
+  ret <vscale x 4 x i32> %res
+}
+
+define <vscale x 2 x i64> @test_svabal_u64(<vscale x 2 x i64> %zda, <vscale x 4 x i32> %zn, <vscale x 4 x i32> %zm) {
+; CHECK-LABEL: test_svabal_u64:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    uabal z0.d, z1.s, z2.s
+; CHECK-NEXT:    ret
+  %res = call <vscale x 2 x i64> @llvm.aarch64.sve.uabal.nxv2i64(<vscale x 2 x i64> %zda, <vscale x 4 x i32> %zn, <vscale x 4 x i32> %zm)
+  ret <vscale x 2 x i64> %res
+}

>From ba8f95194f7cbb5f8ce17b3e1d1d6bf77d44eb0c Mon Sep 17 00:00:00 2001
From: Amilendra Kodithuwakku <amilendra.kodithuwakku at arm.com>
Date: Mon, 27 Apr 2026 15:34:49 +0100
Subject: [PATCH 2/2] Remove -enable-subreg-liveness from the -force-streaming
 tests

---
 .../AArch64/sve2p3-intrinsics/sve2p3-intrinsics-abal.ll       | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/llvm/test/CodeGen/AArch64/sve2p3-intrinsics/sve2p3-intrinsics-abal.ll b/llvm/test/CodeGen/AArch64/sve2p3-intrinsics/sve2p3-intrinsics-abal.ll
index 6988a8ce67055..28841fba3e557 100644
--- a/llvm/test/CodeGen/AArch64/sve2p3-intrinsics/sve2p3-intrinsics-abal.ll
+++ b/llvm/test/CodeGen/AArch64/sve2p3-intrinsics/sve2p3-intrinsics-abal.ll
@@ -1,7 +1,7 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 6
 ; RUN: llc -mtriple=aarch64-linux-gnu -mattr=+sve2p3 -enable-subreg-liveness -verify-machineinstrs < %s | FileCheck %s
-; RUN: llc -mtriple=aarch64-linux-gnu -mattr=+sme2p3 -enable-subreg-liveness -force-streaming -verify-machineinstrs < %s | FileCheck %s
-; RUN: llc -mtriple=aarch64-linux-gnu -mattr=+sme,+sve2p3 -enable-subreg-liveness -force-streaming -verify-machineinstrs < %s | FileCheck %s
+; RUN: llc -mtriple=aarch64-linux-gnu -mattr=+sme2p3 -force-streaming -verify-machineinstrs < %s | FileCheck %s
+; RUN: llc -mtriple=aarch64-linux-gnu -mattr=+sme,+sve2p3 -force-streaming -verify-machineinstrs < %s | FileCheck %s
 
 define <vscale x 8 x i16> @test_svabal_s16(<vscale x 8 x i16> %zda, <vscale x 16 x i8> %zn, <vscale x 16 x i8> %zm) {
 ; CHECK-LABEL: test_svabal_s16: