[clang] [AArch64] Add SVE2.1 intrinsics for indexed quadword gather loads and scatter stores (PR #70476)
Momchil Velikov via cfe-commits
cfe-commits at lists.llvm.org
Fri Oct 27 09:42:21 PDT 2023
https://github.com/momchil-velikov created https://github.com/llvm/llvm-project/pull/70476
[Note, first patch is a part of another pull request]
This patch adds the quadword gather load intrinsics of the form
```
(1) sv<type>_t svld1q_gather_u64index_<typ>(svbool_t, const <type>_t *, svuint64_t);
(2) sv<type>_t svld1q_gather_u64base_index_<typ>(svbool_t, svuint64_t, int64_t);
```
and the quadword scatter store intrinsics of the form
```
(3) void svst1q_scatter_u64index_<typ>(svbool_t, <type>_t *, svuint64_t, sv<type>_t);
(4) void svst1q_scatter_u64base_index_<typ>(svbool, svuint64_t, int64_t, sv<type>_t);
```
(intrinsics (1) and (3) are currently missing the variants for non 64-bit sized base types, e.g. `int8_t` or `bfloat16_t`, etc).
ACLE spec: https://github.com/ARM-software/acle/pull/257
>From 1c47d21a08de2fde89fe28e94a305cd3bcdead85 Mon Sep 17 00:00:00 2001
From: Momchil Velikov <momchil.velikov at arm.com>
Date: Thu, 26 Oct 2023 20:12:20 +0100
Subject: [PATCH 1/2] SVE 128-bit loads/stores
---
clang/include/clang/Basic/arm_sve.td | 46 +
clang/lib/CodeGen/CGBuiltin.cpp | 62 +-
.../acle_sve2p1_ld1_single.c | 255 ++
.../acle_sve2p1_loads.c | 2495 +++++++++++++++++
.../acle_sve2p1_st1_single.c | 255 ++
.../acle_sve2p1_store.c | 2124 ++++++++++++++
llvm/include/llvm/IR/IntrinsicsAArch64.td | 36 +
.../Target/AArch64/AArch64ISelDAGToDAG.cpp | 44 +-
.../Target/AArch64/AArch64ISelLowering.cpp | 11 +-
llvm/lib/Target/AArch64/AArch64ISelLowering.h | 2 +
.../lib/Target/AArch64/AArch64SVEInstrInfo.td | 52 +-
llvm/lib/Target/AArch64/SVEInstrFormats.td | 41 +-
...ics-gather-loads-128bit-unscaled-offset.ll | 105 +
.../AArch64/sve2p1-intrinsics-ld1-single.ll | 144 +
.../sve2p1-intrinsics-multivec-loads.ll | 797 ++++++
.../sve2p1-intrinsics-multivec-stores.ll | 910 ++++++
...s-scatter-stores-128bit-unscaled-offset.ll | 113 +
.../AArch64/sve2p1-intrinsics-st1-single.ll | 130 +
18 files changed, 7606 insertions(+), 16 deletions(-)
create mode 100644 clang/test/CodeGen/aarch64-sve2p1-intrinsics/acle_sve2p1_ld1_single.c
create mode 100644 clang/test/CodeGen/aarch64-sve2p1-intrinsics/acle_sve2p1_loads.c
create mode 100644 clang/test/CodeGen/aarch64-sve2p1-intrinsics/acle_sve2p1_st1_single.c
create mode 100644 clang/test/CodeGen/aarch64-sve2p1-intrinsics/acle_sve2p1_store.c
create mode 100644 llvm/test/CodeGen/AArch64/sve2p1-intrinsics-gather-loads-128bit-unscaled-offset.ll
create mode 100644 llvm/test/CodeGen/AArch64/sve2p1-intrinsics-ld1-single.ll
create mode 100644 llvm/test/CodeGen/AArch64/sve2p1-intrinsics-multivec-loads.ll
create mode 100644 llvm/test/CodeGen/AArch64/sve2p1-intrinsics-multivec-stores.ll
create mode 100644 llvm/test/CodeGen/AArch64/sve2p1-intrinsics-scatter-stores-128bit-unscaled-offset.ll
create mode 100644 llvm/test/CodeGen/AArch64/sve2p1-intrinsics-st1-single.ll
diff --git a/clang/include/clang/Basic/arm_sve.td b/clang/include/clang/Basic/arm_sve.td
index b5baafedd139602..ff9a4062a35912c 100644
--- a/clang/include/clang/Basic/arm_sve.td
+++ b/clang/include/clang/Basic/arm_sve.td
@@ -298,6 +298,29 @@ let TargetGuard = "sve,bf16" in {
def SVBFMLALT_LANE : SInst<"svbfmlalt_lane[_{0}]", "MMddi", "b", MergeNone, "aarch64_sve_bfmlalt_lane_v2", [IsOverloadNone], [ImmCheck<3, ImmCheck0_7>]>;
}
+let TargetGuard = "sve2p1" in {
+ // Contiguous zero-extending load to quadword (single vector).
+ def SVLD1UWQ : MInst<"svld1uwq[_{d}]", "dPc", "iUif", [IsLoad], MemEltTyInt32, "aarch64_sve_ld1uwq">;
+ def SVLD1UWQ_VNUM : MInst<"svld1uwq_vnum[_{d}]", "dPcl", "iUif", [IsLoad], MemEltTyInt32, "aarch64_sve_ld1uwq">;
+
+ def SVLD1UDQ : MInst<"svld1udq[_{d}]", "dPc", "lUld", [IsLoad], MemEltTyInt64, "aarch64_sve_ld1udq">;
+ def SVLD1UDQ_VNUM : MInst<"svld1udq_vnum[_{d}]", "dPcl", "lUld", [IsLoad], MemEltTyInt64, "aarch64_sve_ld1udq">;
+
+ // Load one vector (vector base + scalar offset)
+ def SVLD1Q_GATHER_U64BASE_OFFSET : MInst<"svld1q_gather[_{2}base]_offset_{d}", "dPgl", "cUcsUsiUilUlfhdb", [IsGatherLoad, IsByteIndexed], MemEltTyDefault, "aarch64_sve_ld1q_gather_scalar_offset">;
+ def SVLD1Q_GATHER_U64BASE : MInst<"svld1q_gather[_{2}base]_{d}", "dPg", "cUcsUsiUilUlfhdb", [IsGatherLoad, IsByteIndexed], MemEltTyDefault, "aarch64_sve_ld1q_gather_scalar_offset">;
+
+ // Load N-element structure into N vectors (scalar base)
+ defm SVLD2Q : StructLoad<"svld2q[_{2}]", "2Pc", "aarch64_sve_ld2q_sret">;
+ defm SVLD3Q : StructLoad<"svld3q[_{2}]", "3Pc", "aarch64_sve_ld3q_sret">;
+ defm SVLD4Q : StructLoad<"svld4q[_{2}]", "4Pc", "aarch64_sve_ld4q_sret">;
+
+ // Load N-element structure into N vectors (scalar base, VL displacement)
+ defm SVLD2Q_VNUM : StructLoad<"svld2q_vnum[_{2}]", "2Pcl", "aarch64_sve_ld2q_sret">;
+ defm SVLD3Q_VNUM : StructLoad<"svld3q_vnum[_{2}]", "3Pcl", "aarch64_sve_ld3q_sret">;
+ defm SVLD4Q_VNUM : StructLoad<"svld4q_vnum[_{2}]", "4Pcl", "aarch64_sve_ld4q_sret">;
+}
+
////////////////////////////////////////////////////////////////////////////////
// Stores
@@ -420,6 +443,29 @@ let TargetGuard = "sve,bf16" in {
def SVSTNT1_VNUM_BF : MInst<"svstnt1_vnum[_{d}]", "vPpld", "b", [IsStore], MemEltTyDefault, "aarch64_sve_stnt1">;
}
+let TargetGuard = "sve2p1" in {
+ // Contiguous truncating store from quadword (single vector).
+ def SVST1UWQ : MInst<"svst1uwq[_{d}]", "vPcd", "iUif", [IsStore], MemEltTyInt32, "aarch64_sve_st1uwq">;
+ def SVST1UWQ_VNUM : MInst<"svst1uwq_vnum[_{d}]", "vPcld", "iUif", [IsStore], MemEltTyInt32, "aarch64_sve_st1uwq">;
+
+ def SVST1UDQ : MInst<"svst1udq[_{d}]", "vPcd", "lUld", [IsStore], MemEltTyInt64, "aarch64_sve_st1udq">;
+ def SVST1UDQ_VNUM : MInst<"svst1udq_vnum[_{d}]", "vPcld", "lUld", [IsStore], MemEltTyInt64, "aarch64_sve_st1udq">;
+
+ // Store one vector (vector base + scalar offset)
+ def SVST1Q_SCATTER_U64BASE_OFFSET : MInst<"svst1q_scatter[_{2}base]_offset[_{d}]", "vPgld", "cUcsUsiUilUlfhdb", [IsScatterStore, IsByteIndexed], MemEltTyDefault, "aarch64_sve_st1q_scatter_scalar_offset">;
+ def SVST1Q_SCATTER_U64BASE : MInst<"svst1q_scatter[_{2}base][_{d}]", "vPgd", "cUcsUsiUilUlfhdb", [IsScatterStore, IsByteIndexed], MemEltTyDefault, "aarch64_sve_st1q_scatter_scalar_offset">;
+
+ // Store N vectors into N-element structure (scalar base)
+ defm SVST2Q : StructStore<"svst2q[_{d}]", "vPc2", "aarch64_sve_st2q">;
+ defm SVST3Q : StructStore<"svst3q[_{d}]", "vPc3", "aarch64_sve_st3q">;
+ defm SVST4Q : StructStore<"svst4q[_{d}]", "vPc4", "aarch64_sve_st4q">;
+
+ // Store N vectors into N-element structure (scalar base, VL displacement)
+ defm SVST2Q_VNUM : StructStore<"svst2q_vnum[_{d}]", "vPcl2", "aarch64_sve_st2q">;
+ defm SVST3Q_VNUM : StructStore<"svst3q_vnum[_{d}]", "vPcl3", "aarch64_sve_st3q">;
+ defm SVST4Q_VNUM : StructStore<"svst4q_vnum[_{d}]", "vPcl4", "aarch64_sve_st4q">;
+}
+
////////////////////////////////////////////////////////////////////////////////
// Prefetches
diff --git a/clang/lib/CodeGen/CGBuiltin.cpp b/clang/lib/CodeGen/CGBuiltin.cpp
index dce5ee5888c458e..60bc84cb3602de9 100644
--- a/clang/lib/CodeGen/CGBuiltin.cpp
+++ b/clang/lib/CodeGen/CGBuiltin.cpp
@@ -9529,14 +9529,17 @@ Value *CodeGenFunction::EmitSVEStructLoad(const SVETypeFlags &TypeFlags,
case Intrinsic::aarch64_sve_ld2_sret:
case Intrinsic::aarch64_sve_ld1_pn_x2:
case Intrinsic::aarch64_sve_ldnt1_pn_x2:
+ case Intrinsic::aarch64_sve_ld2q_sret:
N = 2;
break;
case Intrinsic::aarch64_sve_ld3_sret:
+ case Intrinsic::aarch64_sve_ld3q_sret:
N = 3;
break;
case Intrinsic::aarch64_sve_ld4_sret:
case Intrinsic::aarch64_sve_ld1_pn_x4:
case Intrinsic::aarch64_sve_ldnt1_pn_x4:
+ case Intrinsic::aarch64_sve_ld4q_sret:
N = 4;
break;
default:
@@ -9574,14 +9577,17 @@ Value *CodeGenFunction::EmitSVEStructStore(const SVETypeFlags &TypeFlags,
case Intrinsic::aarch64_sve_st2:
case Intrinsic::aarch64_sve_st1_pn_x2:
case Intrinsic::aarch64_sve_stnt1_pn_x2:
+ case Intrinsic::aarch64_sve_st2q:
N = 2;
break;
case Intrinsic::aarch64_sve_st3:
+ case Intrinsic::aarch64_sve_st3q:
N = 3;
break;
case Intrinsic::aarch64_sve_st4:
case Intrinsic::aarch64_sve_st1_pn_x4:
case Intrinsic::aarch64_sve_stnt1_pn_x4:
+ case Intrinsic::aarch64_sve_st4q:
N = 4;
break;
default:
@@ -9662,7 +9668,7 @@ Value *CodeGenFunction::EmitSVEPrefetchLoad(const SVETypeFlags &TypeFlags,
Value *CodeGenFunction::EmitSVEMaskedLoad(const CallExpr *E,
llvm::Type *ReturnTy,
SmallVectorImpl<Value *> &Ops,
- unsigned BuiltinID,
+ unsigned IntrinsicID,
bool IsZExtReturn) {
QualType LangPTy = E->getArg(1)->getType();
llvm::Type *MemEltTy = CGM.getTypes().ConvertType(
@@ -9671,28 +9677,47 @@ Value *CodeGenFunction::EmitSVEMaskedLoad(const CallExpr *E,
// The vector type that is returned may be different from the
// eventual type loaded from memory.
auto VectorTy = cast<llvm::ScalableVectorType>(ReturnTy);
- auto MemoryTy = llvm::ScalableVectorType::get(MemEltTy, VectorTy);
+ llvm::ScalableVectorType *MemoryTy = nullptr;
+ llvm::ScalableVectorType *PredTy = nullptr;
+ bool IsExtendingLoad = true;
+ switch (IntrinsicID) {
+ case Intrinsic::aarch64_sve_ld1uwq:
+ case Intrinsic::aarch64_sve_ld1udq:
+ MemoryTy = llvm::ScalableVectorType::get(MemEltTy, 1);
+ PredTy =
+ llvm::ScalableVectorType::get(IntegerType::get(getLLVMContext(), 1), 1);
+ IsExtendingLoad = false;
+ break;
+ default:
+ MemoryTy = llvm::ScalableVectorType::get(MemEltTy, VectorTy);
+ PredTy = MemoryTy;
+ break;
+ }
- Value *Predicate = EmitSVEPredicateCast(Ops[0], MemoryTy);
+ Value *Predicate = EmitSVEPredicateCast(Ops[0], PredTy);
Value *BasePtr = Ops[1];
// Does the load have an offset?
if (Ops.size() > 2)
BasePtr = Builder.CreateGEP(MemoryTy, BasePtr, Ops[2]);
- Function *F = CGM.getIntrinsic(BuiltinID, MemoryTy);
+ Function *F =
+ CGM.getIntrinsic(IntrinsicID, IsExtendingLoad ? MemoryTy : VectorTy);
auto *Load =
cast<llvm::Instruction>(Builder.CreateCall(F, {Predicate, BasePtr}));
auto TBAAInfo = CGM.getTBAAAccessInfo(LangPTy->getPointeeType());
CGM.DecorateInstructionWithTBAA(Load, TBAAInfo);
+ if (!IsExtendingLoad)
+ return Load;
+
return IsZExtReturn ? Builder.CreateZExt(Load, VectorTy)
- : Builder.CreateSExt(Load, VectorTy);
+ : Builder.CreateSExt(Load, VectorTy);
}
Value *CodeGenFunction::EmitSVEMaskedStore(const CallExpr *E,
SmallVectorImpl<Value *> &Ops,
- unsigned BuiltinID) {
+ unsigned IntrinsicID) {
QualType LangPTy = E->getArg(1)->getType();
llvm::Type *MemEltTy = CGM.getTypes().ConvertType(
LangPTy->castAs<PointerType>()->getPointeeType());
@@ -9702,17 +9727,34 @@ Value *CodeGenFunction::EmitSVEMaskedStore(const CallExpr *E,
auto VectorTy = cast<llvm::ScalableVectorType>(Ops.back()->getType());
auto MemoryTy = llvm::ScalableVectorType::get(MemEltTy, VectorTy);
- Value *Predicate = EmitSVEPredicateCast(Ops[0], MemoryTy);
+ auto PredTy = MemoryTy;
+ auto AddrMemoryTy = MemoryTy;
+ bool IsTruncatingStore = true;
+ ;
+ switch (IntrinsicID) {
+ case Intrinsic::aarch64_sve_st1uwq:
+ case Intrinsic::aarch64_sve_st1udq:
+ AddrMemoryTy = llvm::ScalableVectorType::get(MemEltTy, 1);
+ PredTy =
+ llvm::ScalableVectorType::get(IntegerType::get(getLLVMContext(), 1), 1);
+ IsTruncatingStore = false;
+ break;
+ default:
+ break;
+ }
+ Value *Predicate = EmitSVEPredicateCast(Ops[0], PredTy);
Value *BasePtr = Ops[1];
// Does the store have an offset?
if (Ops.size() == 4)
- BasePtr = Builder.CreateGEP(MemoryTy, BasePtr, Ops[2]);
+ BasePtr = Builder.CreateGEP(AddrMemoryTy, BasePtr, Ops[2]);
// Last value is always the data
- llvm::Value *Val = Builder.CreateTrunc(Ops.back(), MemoryTy);
+ Value *Val = IsTruncatingStore ? Builder.CreateTrunc(Ops.back(), MemoryTy)
+ : Ops.back();
- Function *F = CGM.getIntrinsic(BuiltinID, MemoryTy);
+ Function *F =
+ CGM.getIntrinsic(IntrinsicID, IsTruncatingStore ? MemoryTy : VectorTy);
auto *Store =
cast<llvm::Instruction>(Builder.CreateCall(F, {Val, Predicate, BasePtr}));
auto TBAAInfo = CGM.getTBAAAccessInfo(LangPTy->getPointeeType());
diff --git a/clang/test/CodeGen/aarch64-sve2p1-intrinsics/acle_sve2p1_ld1_single.c b/clang/test/CodeGen/aarch64-sve2p1-intrinsics/acle_sve2p1_ld1_single.c
new file mode 100644
index 000000000000000..16361ecc987d3c5
--- /dev/null
+++ b/clang/test/CodeGen/aarch64-sve2p1-intrinsics/acle_sve2p1_ld1_single.c
@@ -0,0 +1,255 @@
+// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 2
+// REQUIRES: aarch64-registered-target
+// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sve2p1 \
+// RUN: -S -Werror -emit-llvm -disable-O0-optnone -o - %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s
+// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sve2p1 \
+// RUN: -S -Werror -emit-llvm -disable-O0-optnone -o - %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s
+// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sve2p1 \
+// RUN: -S -Werror -emit-llvm -disable-O0-optnone -o - -x c++ %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK
+// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sve2p1 \
+// RUN: -S -Werror -emit-llvm -disable-O0-optnone -o - -x c++ %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK
+// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sve2p1 \
+// RUN: -S -disable-O0-optnone -Werror -Wall -o /dev/null %s
+
+
+#include <arm_sve.h>
+
+#ifdef SVE_OVERLOADED_FORMS
+// A simple used,unused... macro, long enough to represent any SVE builtin.
+#define SVE_ACLE_FUNC(A1, A2_UNUSED, A3, A4_UNUSED) A1##A3
+#else
+#define SVE_ACLE_FUNC(A1, A2, A3, A4) A1##A2##A3##A4
+#endif
+
+// LD1W
+
+// CHECK-LABEL: define dso_local <vscale x 4 x i32> @test_svld1uwq_u32
+// CHECK-SAME: (<vscale x 16 x i1> [[PRED:%.*]], ptr noundef [[BASE:%.*]]) #[[ATTR0:[0-9]+]] {
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 1 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv1i1(<vscale x 16 x i1> [[PRED]])
+// CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 4 x i32> @llvm.aarch64.sve.ld1uwq.nxv4i32(<vscale x 1 x i1> [[TMP0]], ptr [[BASE]])
+// CHECK-NEXT: ret <vscale x 4 x i32> [[TMP1]]
+//
+// CPP-CHECK-LABEL: define dso_local <vscale x 4 x i32> @_Z17test_svld1uwq_u32u10__SVBool_tPKj
+// CPP-CHECK-SAME: (<vscale x 16 x i1> [[PRED:%.*]], ptr noundef [[BASE:%.*]]) #[[ATTR0:[0-9]+]] {
+// CPP-CHECK-NEXT: entry:
+// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 1 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv1i1(<vscale x 16 x i1> [[PRED]])
+// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 4 x i32> @llvm.aarch64.sve.ld1uwq.nxv4i32(<vscale x 1 x i1> [[TMP0]], ptr [[BASE]])
+// CPP-CHECK-NEXT: ret <vscale x 4 x i32> [[TMP1]]
+//
+svuint32_t test_svld1uwq_u32(svbool_t pred, uint32_t const * base) {
+ return SVE_ACLE_FUNC(svld1uwq, _u32, , )(pred, base);
+}
+
+// CHECK-LABEL: define dso_local <vscale x 4 x i32> @test_svld1uwq_vnum_u32
+// CHECK-SAME: (<vscale x 16 x i1> [[PRED:%.*]], ptr noundef [[BASE:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 1 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv1i1(<vscale x 16 x i1> [[PRED]])
+// CHECK-NEXT: [[TMP1:%.*]] = getelementptr <vscale x 1 x i32>, ptr [[BASE]], i64 -8
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <vscale x 4 x i32> @llvm.aarch64.sve.ld1uwq.nxv4i32(<vscale x 1 x i1> [[TMP0]], ptr [[TMP1]])
+// CHECK-NEXT: ret <vscale x 4 x i32> [[TMP2]]
+//
+// CPP-CHECK-LABEL: define dso_local <vscale x 4 x i32> @_Z22test_svld1uwq_vnum_u32u10__SVBool_tPKj
+// CPP-CHECK-SAME: (<vscale x 16 x i1> [[PRED:%.*]], ptr noundef [[BASE:%.*]]) #[[ATTR0]] {
+// CPP-CHECK-NEXT: entry:
+// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 1 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv1i1(<vscale x 16 x i1> [[PRED]])
+// CPP-CHECK-NEXT: [[TMP1:%.*]] = getelementptr <vscale x 1 x i32>, ptr [[BASE]], i64 -8
+// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call <vscale x 4 x i32> @llvm.aarch64.sve.ld1uwq.nxv4i32(<vscale x 1 x i1> [[TMP0]], ptr [[TMP1]])
+// CPP-CHECK-NEXT: ret <vscale x 4 x i32> [[TMP2]]
+//
+svuint32_t test_svld1uwq_vnum_u32(svbool_t pred, uint32_t const * base) {
+ return SVE_ACLE_FUNC(svld1uwq_vnum, _u32, , )(pred, base, -8);
+}
+
+// CHECK-LABEL: define dso_local <vscale x 4 x i32> @test_svld1uwq_s32
+// CHECK-SAME: (<vscale x 16 x i1> [[PRED:%.*]], ptr noundef [[BASE:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 1 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv1i1(<vscale x 16 x i1> [[PRED]])
+// CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 4 x i32> @llvm.aarch64.sve.ld1uwq.nxv4i32(<vscale x 1 x i1> [[TMP0]], ptr [[BASE]])
+// CHECK-NEXT: ret <vscale x 4 x i32> [[TMP1]]
+//
+// CPP-CHECK-LABEL: define dso_local <vscale x 4 x i32> @_Z17test_svld1uwq_s32u10__SVBool_tPKi
+// CPP-CHECK-SAME: (<vscale x 16 x i1> [[PRED:%.*]], ptr noundef [[BASE:%.*]]) #[[ATTR0]] {
+// CPP-CHECK-NEXT: entry:
+// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 1 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv1i1(<vscale x 16 x i1> [[PRED]])
+// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 4 x i32> @llvm.aarch64.sve.ld1uwq.nxv4i32(<vscale x 1 x i1> [[TMP0]], ptr [[BASE]])
+// CPP-CHECK-NEXT: ret <vscale x 4 x i32> [[TMP1]]
+//
+svint32_t test_svld1uwq_s32(svbool_t pred, int32_t const * base) {
+ return SVE_ACLE_FUNC(svld1uwq, _s32, , )(pred, base);
+}
+
+// CHECK-LABEL: define dso_local <vscale x 4 x i32> @test_svld1uwq_vnum_s32
+// CHECK-SAME: (<vscale x 16 x i1> [[PRED:%.*]], ptr noundef [[BASE:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 1 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv1i1(<vscale x 16 x i1> [[PRED]])
+// CHECK-NEXT: [[TMP1:%.*]] = getelementptr <vscale x 1 x i32>, ptr [[BASE]], i64 7
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <vscale x 4 x i32> @llvm.aarch64.sve.ld1uwq.nxv4i32(<vscale x 1 x i1> [[TMP0]], ptr [[TMP1]])
+// CHECK-NEXT: ret <vscale x 4 x i32> [[TMP2]]
+//
+// CPP-CHECK-LABEL: define dso_local <vscale x 4 x i32> @_Z22test_svld1uwq_vnum_s32u10__SVBool_tPKi
+// CPP-CHECK-SAME: (<vscale x 16 x i1> [[PRED:%.*]], ptr noundef [[BASE:%.*]]) #[[ATTR0]] {
+// CPP-CHECK-NEXT: entry:
+// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 1 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv1i1(<vscale x 16 x i1> [[PRED]])
+// CPP-CHECK-NEXT: [[TMP1:%.*]] = getelementptr <vscale x 1 x i32>, ptr [[BASE]], i64 7
+// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call <vscale x 4 x i32> @llvm.aarch64.sve.ld1uwq.nxv4i32(<vscale x 1 x i1> [[TMP0]], ptr [[TMP1]])
+// CPP-CHECK-NEXT: ret <vscale x 4 x i32> [[TMP2]]
+//
+svint32_t test_svld1uwq_vnum_s32(svbool_t pred, int32_t const * base) {
+ return SVE_ACLE_FUNC(svld1uwq_vnum, _s32, , )(pred, base, 7);
+}
+
+// CHECK-LABEL: define dso_local <vscale x 4 x float> @test_svld1uwq_f32
+// CHECK-SAME: (<vscale x 16 x i1> [[PRED:%.*]], ptr noundef [[BASE:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 1 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv1i1(<vscale x 16 x i1> [[PRED]])
+// CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 4 x float> @llvm.aarch64.sve.ld1uwq.nxv4f32(<vscale x 1 x i1> [[TMP0]], ptr [[BASE]])
+// CHECK-NEXT: ret <vscale x 4 x float> [[TMP1]]
+//
+// CPP-CHECK-LABEL: define dso_local <vscale x 4 x float> @_Z17test_svld1uwq_f32u10__SVBool_tPKf
+// CPP-CHECK-SAME: (<vscale x 16 x i1> [[PRED:%.*]], ptr noundef [[BASE:%.*]]) #[[ATTR0]] {
+// CPP-CHECK-NEXT: entry:
+// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 1 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv1i1(<vscale x 16 x i1> [[PRED]])
+// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 4 x float> @llvm.aarch64.sve.ld1uwq.nxv4f32(<vscale x 1 x i1> [[TMP0]], ptr [[BASE]])
+// CPP-CHECK-NEXT: ret <vscale x 4 x float> [[TMP1]]
+//
+svfloat32_t test_svld1uwq_f32(svbool_t pred, float32_t const * base) {
+ return SVE_ACLE_FUNC(svld1uwq, _f32, , )(pred, base);
+}
+
+// CHECK-LABEL: define dso_local <vscale x 4 x float> @test_svld1uwq_vnum_f32
+// CHECK-SAME: (<vscale x 16 x i1> [[PRED:%.*]], ptr noundef [[BASE:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 1 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv1i1(<vscale x 16 x i1> [[PRED]])
+// CHECK-NEXT: [[TMP1:%.*]] = getelementptr <vscale x 1 x float>, ptr [[BASE]], i64 -8
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <vscale x 4 x float> @llvm.aarch64.sve.ld1uwq.nxv4f32(<vscale x 1 x i1> [[TMP0]], ptr [[TMP1]])
+// CHECK-NEXT: ret <vscale x 4 x float> [[TMP2]]
+//
+// CPP-CHECK-LABEL: define dso_local <vscale x 4 x float> @_Z22test_svld1uwq_vnum_f32u10__SVBool_tPKf
+// CPP-CHECK-SAME: (<vscale x 16 x i1> [[PRED:%.*]], ptr noundef [[BASE:%.*]]) #[[ATTR0]] {
+// CPP-CHECK-NEXT: entry:
+// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 1 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv1i1(<vscale x 16 x i1> [[PRED]])
+// CPP-CHECK-NEXT: [[TMP1:%.*]] = getelementptr <vscale x 1 x float>, ptr [[BASE]], i64 -8
+// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call <vscale x 4 x float> @llvm.aarch64.sve.ld1uwq.nxv4f32(<vscale x 1 x i1> [[TMP0]], ptr [[TMP1]])
+// CPP-CHECK-NEXT: ret <vscale x 4 x float> [[TMP2]]
+//
+svfloat32_t test_svld1uwq_vnum_f32(svbool_t pred, float32_t const * base) {
+ return SVE_ACLE_FUNC(svld1uwq_vnum, _f32, , )(pred, base, -8);
+}
+
+
+// LD1D
+
+// CHECK-LABEL: define dso_local <vscale x 2 x i64> @test_svld1udq_u64
+// CHECK-SAME: (<vscale x 16 x i1> [[PRED:%.*]], ptr noundef [[BASE:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 1 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv1i1(<vscale x 16 x i1> [[PRED]])
+// CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 2 x i64> @llvm.aarch64.sve.ld1udq.nxv2i64(<vscale x 1 x i1> [[TMP0]], ptr [[BASE]])
+// CHECK-NEXT: ret <vscale x 2 x i64> [[TMP1]]
+//
+// CPP-CHECK-LABEL: define dso_local <vscale x 2 x i64> @_Z17test_svld1udq_u64u10__SVBool_tPKm
+// CPP-CHECK-SAME: (<vscale x 16 x i1> [[PRED:%.*]], ptr noundef [[BASE:%.*]]) #[[ATTR0]] {
+// CPP-CHECK-NEXT: entry:
+// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 1 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv1i1(<vscale x 16 x i1> [[PRED]])
+// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 2 x i64> @llvm.aarch64.sve.ld1udq.nxv2i64(<vscale x 1 x i1> [[TMP0]], ptr [[BASE]])
+// CPP-CHECK-NEXT: ret <vscale x 2 x i64> [[TMP1]]
+//
+svuint64_t test_svld1udq_u64(svbool_t pred, uint64_t const * base) {
+ return SVE_ACLE_FUNC(svld1udq, _u64, , )(pred, base);
+}
+
+// CHECK-LABEL: define dso_local <vscale x 2 x i64> @test_svld1udq_vnum_u64
+// CHECK-SAME: (<vscale x 16 x i1> [[PRED:%.*]], ptr noundef [[BASE:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 1 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv1i1(<vscale x 16 x i1> [[PRED]])
+// CHECK-NEXT: [[TMP1:%.*]] = getelementptr <vscale x 1 x i64>, ptr [[BASE]], i64 7
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <vscale x 2 x i64> @llvm.aarch64.sve.ld1udq.nxv2i64(<vscale x 1 x i1> [[TMP0]], ptr [[TMP1]])
+// CHECK-NEXT: ret <vscale x 2 x i64> [[TMP2]]
+//
+// CPP-CHECK-LABEL: define dso_local <vscale x 2 x i64> @_Z22test_svld1udq_vnum_u64u10__SVBool_tPKm
+// CPP-CHECK-SAME: (<vscale x 16 x i1> [[PRED:%.*]], ptr noundef [[BASE:%.*]]) #[[ATTR0]] {
+// CPP-CHECK-NEXT: entry:
+// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 1 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv1i1(<vscale x 16 x i1> [[PRED]])
+// CPP-CHECK-NEXT: [[TMP1:%.*]] = getelementptr <vscale x 1 x i64>, ptr [[BASE]], i64 7
+// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call <vscale x 2 x i64> @llvm.aarch64.sve.ld1udq.nxv2i64(<vscale x 1 x i1> [[TMP0]], ptr [[TMP1]])
+// CPP-CHECK-NEXT: ret <vscale x 2 x i64> [[TMP2]]
+//
+svuint64_t test_svld1udq_vnum_u64(svbool_t pred, uint64_t const * base) {
+ return SVE_ACLE_FUNC(svld1udq_vnum, _u64, , )(pred, base, 7);
+}
+
+// CHECK-LABEL: define dso_local <vscale x 2 x i64> @test_svld1udq_s64
+// CHECK-SAME: (<vscale x 16 x i1> [[PRED:%.*]], ptr noundef [[BASE:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 1 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv1i1(<vscale x 16 x i1> [[PRED]])
+// CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 2 x i64> @llvm.aarch64.sve.ld1udq.nxv2i64(<vscale x 1 x i1> [[TMP0]], ptr [[BASE]])
+// CHECK-NEXT: ret <vscale x 2 x i64> [[TMP1]]
+//
+// CPP-CHECK-LABEL: define dso_local <vscale x 2 x i64> @_Z17test_svld1udq_s64u10__SVBool_tPKl
+// CPP-CHECK-SAME: (<vscale x 16 x i1> [[PRED:%.*]], ptr noundef [[BASE:%.*]]) #[[ATTR0]] {
+// CPP-CHECK-NEXT: entry:
+// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 1 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv1i1(<vscale x 16 x i1> [[PRED]])
+// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 2 x i64> @llvm.aarch64.sve.ld1udq.nxv2i64(<vscale x 1 x i1> [[TMP0]], ptr [[BASE]])
+// CPP-CHECK-NEXT: ret <vscale x 2 x i64> [[TMP1]]
+//
+svint64_t test_svld1udq_s64(svbool_t pred, int64_t const * base) {
+ return SVE_ACLE_FUNC(svld1udq, _s64, , )(pred, base);
+}
+
+// CHECK-LABEL: define dso_local <vscale x 2 x i64> @test_svld1udq_vnum_s64
+// CHECK-SAME: (<vscale x 16 x i1> [[PRED:%.*]], ptr noundef [[BASE:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 1 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv1i1(<vscale x 16 x i1> [[PRED]])
+// CHECK-NEXT: [[TMP1:%.*]] = getelementptr <vscale x 1 x i64>, ptr [[BASE]], i64 -8
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <vscale x 2 x i64> @llvm.aarch64.sve.ld1udq.nxv2i64(<vscale x 1 x i1> [[TMP0]], ptr [[TMP1]])
+// CHECK-NEXT: ret <vscale x 2 x i64> [[TMP2]]
+//
+// CPP-CHECK-LABEL: define dso_local <vscale x 2 x i64> @_Z22test_svld1udq_vnum_s64u10__SVBool_tPKl
+// CPP-CHECK-SAME: (<vscale x 16 x i1> [[PRED:%.*]], ptr noundef [[BASE:%.*]]) #[[ATTR0]] {
+// CPP-CHECK-NEXT: entry:
+// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 1 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv1i1(<vscale x 16 x i1> [[PRED]])
+// CPP-CHECK-NEXT: [[TMP1:%.*]] = getelementptr <vscale x 1 x i64>, ptr [[BASE]], i64 -8
+// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call <vscale x 2 x i64> @llvm.aarch64.sve.ld1udq.nxv2i64(<vscale x 1 x i1> [[TMP0]], ptr [[TMP1]])
+// CPP-CHECK-NEXT: ret <vscale x 2 x i64> [[TMP2]]
+//
+svint64_t test_svld1udq_vnum_s64(svbool_t pred, int64_t const * base) {
+ return SVE_ACLE_FUNC(svld1udq_vnum, _s64, , )(pred, base, -8);
+}
+
+// CHECK-LABEL: define dso_local <vscale x 2 x double> @test_svld1udq_f64
+// CHECK-SAME: (<vscale x 16 x i1> [[PRED:%.*]], ptr noundef [[BASE:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 1 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv1i1(<vscale x 16 x i1> [[PRED]])
+// CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 2 x double> @llvm.aarch64.sve.ld1udq.nxv2f64(<vscale x 1 x i1> [[TMP0]], ptr [[BASE]])
+// CHECK-NEXT: ret <vscale x 2 x double> [[TMP1]]
+//
+// CPP-CHECK-LABEL: define dso_local <vscale x 2 x double> @_Z17test_svld1udq_f64u10__SVBool_tPKd
+// CPP-CHECK-SAME: (<vscale x 16 x i1> [[PRED:%.*]], ptr noundef [[BASE:%.*]]) #[[ATTR0]] {
+// CPP-CHECK-NEXT: entry:
+// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 1 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv1i1(<vscale x 16 x i1> [[PRED]])
+// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 2 x double> @llvm.aarch64.sve.ld1udq.nxv2f64(<vscale x 1 x i1> [[TMP0]], ptr [[BASE]])
+// CPP-CHECK-NEXT: ret <vscale x 2 x double> [[TMP1]]
+//
+svfloat64_t test_svld1udq_f64(svbool_t pred, float64_t const * base) {
+ return SVE_ACLE_FUNC(svld1udq, _f64, , )(pred, base);
+}
+
+// CHECK-LABEL: define dso_local <vscale x 2 x double> @test_svld1udq_vnum_f64
+// CHECK-SAME: (<vscale x 16 x i1> [[PRED:%.*]], ptr noundef [[BASE:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 1 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv1i1(<vscale x 16 x i1> [[PRED]])
+// CHECK-NEXT: [[TMP1:%.*]] = getelementptr <vscale x 1 x double>, ptr [[BASE]], i64 7
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <vscale x 2 x double> @llvm.aarch64.sve.ld1udq.nxv2f64(<vscale x 1 x i1> [[TMP0]], ptr [[TMP1]])
+// CHECK-NEXT: ret <vscale x 2 x double> [[TMP2]]
+//
+// CPP-CHECK-LABEL: define dso_local <vscale x 2 x double> @_Z22test_svld1udq_vnum_f64u10__SVBool_tPKd
+// CPP-CHECK-SAME: (<vscale x 16 x i1> [[PRED:%.*]], ptr noundef [[BASE:%.*]]) #[[ATTR0]] {
+// CPP-CHECK-NEXT: entry:
+// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 1 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv1i1(<vscale x 16 x i1> [[PRED]])
+// CPP-CHECK-NEXT: [[TMP1:%.*]] = getelementptr <vscale x 1 x double>, ptr [[BASE]], i64 7
+// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call <vscale x 2 x double> @llvm.aarch64.sve.ld1udq.nxv2f64(<vscale x 1 x i1> [[TMP0]], ptr [[TMP1]])
+// CPP-CHECK-NEXT: ret <vscale x 2 x double> [[TMP2]]
+//
+svfloat64_t test_svld1udq_vnum_f64(svbool_t pred, float64_t const * base) {
+ return SVE_ACLE_FUNC(svld1udq_vnum, _f64, , )(pred, base, 7);
+}
diff --git a/clang/test/CodeGen/aarch64-sve2p1-intrinsics/acle_sve2p1_loads.c b/clang/test/CodeGen/aarch64-sve2p1-intrinsics/acle_sve2p1_loads.c
new file mode 100644
index 000000000000000..65d84219e7eb0d8
--- /dev/null
+++ b/clang/test/CodeGen/aarch64-sve2p1-intrinsics/acle_sve2p1_loads.c
@@ -0,0 +1,2495 @@
+// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py
+// REQUIRES: aarch64-registered-target
+// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sve2p1 -target-feature +bf16 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s
+// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sve2p1 -target-feature +bf16 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK
+// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sve2p1 -target-feature +bf16 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s
+// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sve2p1 -target-feature +bf16 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK
+// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sve2p1 -target-feature +bf16 -S -disable-O0-optnone -Werror -Wall -o /dev/null %s
+
+#include <arm_sve.h>
+
+#ifdef SVE_OVERLOADED_FORMS
+// A simple used,unused... macro, long enough to represent any SVE builtin.
+#define SVE_ACLE_FUNC(A1,A2_UNUSED,A3,A4_UNUSED) A1##A3
+#else
+#define SVE_ACLE_FUNC(A1,A2,A3,A4) A1##A2##A3##A4
+#endif
+
+// CHECK-LABEL: @test_svld2q_u8(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[TMP0:%.*]] = tail call { <vscale x 16 x i8>, <vscale x 16 x i8> } @llvm.aarch64.sve.ld2q.sret.nxv16i8(<vscale x 16 x i1> [[PG:%.*]], ptr [[BASE:%.*]])
+// CHECK-NEXT: [[TMP1:%.*]] = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8> } [[TMP0]], 0
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <vscale x 32 x i8> @llvm.vector.insert.nxv32i8.nxv16i8(<vscale x 32 x i8> poison, <vscale x 16 x i8> [[TMP1]], i64 0)
+// CHECK-NEXT: [[TMP3:%.*]] = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8> } [[TMP0]], 1
+// CHECK-NEXT: [[TMP4:%.*]] = tail call <vscale x 32 x i8> @llvm.vector.insert.nxv32i8.nxv16i8(<vscale x 32 x i8> [[TMP2]], <vscale x 16 x i8> [[TMP3]], i64 16)
+// CHECK-NEXT: ret <vscale x 32 x i8> [[TMP4]]
+//
+// CPP-CHECK-LABEL: @_Z14test_svld2q_u8u10__SVBool_tPKh(
+// CPP-CHECK-NEXT: entry:
+// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call { <vscale x 16 x i8>, <vscale x 16 x i8> } @llvm.aarch64.sve.ld2q.sret.nxv16i8(<vscale x 16 x i1> [[PG:%.*]], ptr [[BASE:%.*]])
+// CPP-CHECK-NEXT: [[TMP1:%.*]] = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8> } [[TMP0]], 0
+// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call <vscale x 32 x i8> @llvm.vector.insert.nxv32i8.nxv16i8(<vscale x 32 x i8> poison, <vscale x 16 x i8> [[TMP1]], i64 0)
+// CPP-CHECK-NEXT: [[TMP3:%.*]] = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8> } [[TMP0]], 1
+// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call <vscale x 32 x i8> @llvm.vector.insert.nxv32i8.nxv16i8(<vscale x 32 x i8> [[TMP2]], <vscale x 16 x i8> [[TMP3]], i64 16)
+// CPP-CHECK-NEXT: ret <vscale x 32 x i8> [[TMP4]]
+//
+svuint8x2_t test_svld2q_u8(svbool_t pg, const uint8_t *base)
+{
+ return SVE_ACLE_FUNC(svld2q,,_u8,)(pg, base);
+}
+
+// CHECK-LABEL: @test_svld2q_s8(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[TMP0:%.*]] = tail call { <vscale x 16 x i8>, <vscale x 16 x i8> } @llvm.aarch64.sve.ld2q.sret.nxv16i8(<vscale x 16 x i1> [[PG:%.*]], ptr [[BASE:%.*]])
+// CHECK-NEXT: [[TMP1:%.*]] = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8> } [[TMP0]], 0
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <vscale x 32 x i8> @llvm.vector.insert.nxv32i8.nxv16i8(<vscale x 32 x i8> poison, <vscale x 16 x i8> [[TMP1]], i64 0)
+// CHECK-NEXT: [[TMP3:%.*]] = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8> } [[TMP0]], 1
+// CHECK-NEXT: [[TMP4:%.*]] = tail call <vscale x 32 x i8> @llvm.vector.insert.nxv32i8.nxv16i8(<vscale x 32 x i8> [[TMP2]], <vscale x 16 x i8> [[TMP3]], i64 16)
+// CHECK-NEXT: ret <vscale x 32 x i8> [[TMP4]]
+//
+// CPP-CHECK-LABEL: @_Z14test_svld2q_s8u10__SVBool_tPKa(
+// CPP-CHECK-NEXT: entry:
+// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call { <vscale x 16 x i8>, <vscale x 16 x i8> } @llvm.aarch64.sve.ld2q.sret.nxv16i8(<vscale x 16 x i1> [[PG:%.*]], ptr [[BASE:%.*]])
+// CPP-CHECK-NEXT: [[TMP1:%.*]] = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8> } [[TMP0]], 0
+// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call <vscale x 32 x i8> @llvm.vector.insert.nxv32i8.nxv16i8(<vscale x 32 x i8> poison, <vscale x 16 x i8> [[TMP1]], i64 0)
+// CPP-CHECK-NEXT: [[TMP3:%.*]] = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8> } [[TMP0]], 1
+// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call <vscale x 32 x i8> @llvm.vector.insert.nxv32i8.nxv16i8(<vscale x 32 x i8> [[TMP2]], <vscale x 16 x i8> [[TMP3]], i64 16)
+// CPP-CHECK-NEXT: ret <vscale x 32 x i8> [[TMP4]]
+//
+svint8x2_t test_svld2q_s8(svbool_t pg, const int8_t *base)
+{
+ return SVE_ACLE_FUNC(svld2q,,_s8,)(pg, base);
+}
+// CHECK-LABEL: @test_svld2q_u16(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 8 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv8i1(<vscale x 16 x i1> [[PG:%.*]])
+// CHECK-NEXT: [[TMP1:%.*]] = tail call { <vscale x 8 x i16>, <vscale x 8 x i16> } @llvm.aarch64.sve.ld2q.sret.nxv8i16(<vscale x 8 x i1> [[TMP0]], ptr [[BASE:%.*]])
+// CHECK-NEXT: [[TMP2:%.*]] = extractvalue { <vscale x 8 x i16>, <vscale x 8 x i16> } [[TMP1]], 0
+// CHECK-NEXT: [[TMP3:%.*]] = tail call <vscale x 16 x i16> @llvm.vector.insert.nxv16i16.nxv8i16(<vscale x 16 x i16> poison, <vscale x 8 x i16> [[TMP2]], i64 0)
+// CHECK-NEXT: [[TMP4:%.*]] = extractvalue { <vscale x 8 x i16>, <vscale x 8 x i16> } [[TMP1]], 1
+// CHECK-NEXT: [[TMP5:%.*]] = tail call <vscale x 16 x i16> @llvm.vector.insert.nxv16i16.nxv8i16(<vscale x 16 x i16> [[TMP3]], <vscale x 8 x i16> [[TMP4]], i64 8)
+// CHECK-NEXT: ret <vscale x 16 x i16> [[TMP5]]
+//
+// CPP-CHECK-LABEL: @_Z15test_svld2q_u16u10__SVBool_tPKt(
+// CPP-CHECK-NEXT: entry:
+// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 8 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv8i1(<vscale x 16 x i1> [[PG:%.*]])
+// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call { <vscale x 8 x i16>, <vscale x 8 x i16> } @llvm.aarch64.sve.ld2q.sret.nxv8i16(<vscale x 8 x i1> [[TMP0]], ptr [[BASE:%.*]])
+// CPP-CHECK-NEXT: [[TMP2:%.*]] = extractvalue { <vscale x 8 x i16>, <vscale x 8 x i16> } [[TMP1]], 0
+// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call <vscale x 16 x i16> @llvm.vector.insert.nxv16i16.nxv8i16(<vscale x 16 x i16> poison, <vscale x 8 x i16> [[TMP2]], i64 0)
+// CPP-CHECK-NEXT: [[TMP4:%.*]] = extractvalue { <vscale x 8 x i16>, <vscale x 8 x i16> } [[TMP1]], 1
+// CPP-CHECK-NEXT: [[TMP5:%.*]] = tail call <vscale x 16 x i16> @llvm.vector.insert.nxv16i16.nxv8i16(<vscale x 16 x i16> [[TMP3]], <vscale x 8 x i16> [[TMP4]], i64 8)
+// CPP-CHECK-NEXT: ret <vscale x 16 x i16> [[TMP5]]
+//
+svuint16x2_t test_svld2q_u16(svbool_t pg, const uint16_t *base)
+{
+ return SVE_ACLE_FUNC(svld2q,,_u16,)(pg, base);
+}
+
+// CHECK-LABEL: @test_svld2q_s16(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 8 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv8i1(<vscale x 16 x i1> [[PG:%.*]])
+// CHECK-NEXT: [[TMP1:%.*]] = tail call { <vscale x 8 x i16>, <vscale x 8 x i16> } @llvm.aarch64.sve.ld2q.sret.nxv8i16(<vscale x 8 x i1> [[TMP0]], ptr [[BASE:%.*]])
+// CHECK-NEXT: [[TMP2:%.*]] = extractvalue { <vscale x 8 x i16>, <vscale x 8 x i16> } [[TMP1]], 0
+// CHECK-NEXT: [[TMP3:%.*]] = tail call <vscale x 16 x i16> @llvm.vector.insert.nxv16i16.nxv8i16(<vscale x 16 x i16> poison, <vscale x 8 x i16> [[TMP2]], i64 0)
+// CHECK-NEXT: [[TMP4:%.*]] = extractvalue { <vscale x 8 x i16>, <vscale x 8 x i16> } [[TMP1]], 1
+// CHECK-NEXT: [[TMP5:%.*]] = tail call <vscale x 16 x i16> @llvm.vector.insert.nxv16i16.nxv8i16(<vscale x 16 x i16> [[TMP3]], <vscale x 8 x i16> [[TMP4]], i64 8)
+// CHECK-NEXT: ret <vscale x 16 x i16> [[TMP5]]
+//
+// CPP-CHECK-LABEL: @_Z15test_svld2q_s16u10__SVBool_tPKs(
+// CPP-CHECK-NEXT: entry:
+// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 8 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv8i1(<vscale x 16 x i1> [[PG:%.*]])
+// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call { <vscale x 8 x i16>, <vscale x 8 x i16> } @llvm.aarch64.sve.ld2q.sret.nxv8i16(<vscale x 8 x i1> [[TMP0]], ptr [[BASE:%.*]])
+// CPP-CHECK-NEXT: [[TMP2:%.*]] = extractvalue { <vscale x 8 x i16>, <vscale x 8 x i16> } [[TMP1]], 0
+// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call <vscale x 16 x i16> @llvm.vector.insert.nxv16i16.nxv8i16(<vscale x 16 x i16> poison, <vscale x 8 x i16> [[TMP2]], i64 0)
+// CPP-CHECK-NEXT: [[TMP4:%.*]] = extractvalue { <vscale x 8 x i16>, <vscale x 8 x i16> } [[TMP1]], 1
+// CPP-CHECK-NEXT: [[TMP5:%.*]] = tail call <vscale x 16 x i16> @llvm.vector.insert.nxv16i16.nxv8i16(<vscale x 16 x i16> [[TMP3]], <vscale x 8 x i16> [[TMP4]], i64 8)
+// CPP-CHECK-NEXT: ret <vscale x 16 x i16> [[TMP5]]
+//
+svint16x2_t test_svld2q_s16(svbool_t pg, const int16_t *base)
+{
+ return SVE_ACLE_FUNC(svld2q,,_s16,)(pg, base);
+}
+
+// CHECK-LABEL: @test_svld2q_u32(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 4 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv4i1(<vscale x 16 x i1> [[PG:%.*]])
+// CHECK-NEXT: [[TMP1:%.*]] = tail call { <vscale x 4 x i32>, <vscale x 4 x i32> } @llvm.aarch64.sve.ld2q.sret.nxv4i32(<vscale x 4 x i1> [[TMP0]], ptr [[BASE:%.*]])
+// CHECK-NEXT: [[TMP2:%.*]] = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32> } [[TMP1]], 0
+// CHECK-NEXT: [[TMP3:%.*]] = tail call <vscale x 8 x i32> @llvm.vector.insert.nxv8i32.nxv4i32(<vscale x 8 x i32> poison, <vscale x 4 x i32> [[TMP2]], i64 0)
+// CHECK-NEXT: [[TMP4:%.*]] = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32> } [[TMP1]], 1
+// CHECK-NEXT: [[TMP5:%.*]] = tail call <vscale x 8 x i32> @llvm.vector.insert.nxv8i32.nxv4i32(<vscale x 8 x i32> [[TMP3]], <vscale x 4 x i32> [[TMP4]], i64 4)
+// CHECK-NEXT: ret <vscale x 8 x i32> [[TMP5]]
+//
+// CPP-CHECK-LABEL: @_Z15test_svld2q_u32u10__SVBool_tPKj(
+// CPP-CHECK-NEXT: entry:
+// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 4 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv4i1(<vscale x 16 x i1> [[PG:%.*]])
+// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call { <vscale x 4 x i32>, <vscale x 4 x i32> } @llvm.aarch64.sve.ld2q.sret.nxv4i32(<vscale x 4 x i1> [[TMP0]], ptr [[BASE:%.*]])
+// CPP-CHECK-NEXT: [[TMP2:%.*]] = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32> } [[TMP1]], 0
+// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call <vscale x 8 x i32> @llvm.vector.insert.nxv8i32.nxv4i32(<vscale x 8 x i32> poison, <vscale x 4 x i32> [[TMP2]], i64 0)
+// CPP-CHECK-NEXT: [[TMP4:%.*]] = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32> } [[TMP1]], 1
+// CPP-CHECK-NEXT: [[TMP5:%.*]] = tail call <vscale x 8 x i32> @llvm.vector.insert.nxv8i32.nxv4i32(<vscale x 8 x i32> [[TMP3]], <vscale x 4 x i32> [[TMP4]], i64 4)
+// CPP-CHECK-NEXT: ret <vscale x 8 x i32> [[TMP5]]
+//
+svuint32x2_t test_svld2q_u32(svbool_t pg, const uint32_t *base)
+{
+ return SVE_ACLE_FUNC(svld2q,,_u32,)(pg, base);
+}
+
+// CHECK-LABEL: @test_svld2q_s32(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 4 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv4i1(<vscale x 16 x i1> [[PG:%.*]])
+// CHECK-NEXT: [[TMP1:%.*]] = tail call { <vscale x 4 x i32>, <vscale x 4 x i32> } @llvm.aarch64.sve.ld2q.sret.nxv4i32(<vscale x 4 x i1> [[TMP0]], ptr [[BASE:%.*]])
+// CHECK-NEXT: [[TMP2:%.*]] = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32> } [[TMP1]], 0
+// CHECK-NEXT: [[TMP3:%.*]] = tail call <vscale x 8 x i32> @llvm.vector.insert.nxv8i32.nxv4i32(<vscale x 8 x i32> poison, <vscale x 4 x i32> [[TMP2]], i64 0)
+// CHECK-NEXT: [[TMP4:%.*]] = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32> } [[TMP1]], 1
+// CHECK-NEXT: [[TMP5:%.*]] = tail call <vscale x 8 x i32> @llvm.vector.insert.nxv8i32.nxv4i32(<vscale x 8 x i32> [[TMP3]], <vscale x 4 x i32> [[TMP4]], i64 4)
+// CHECK-NEXT: ret <vscale x 8 x i32> [[TMP5]]
+//
+// CPP-CHECK-LABEL: @_Z15test_svld2q_s32u10__SVBool_tPKi(
+// CPP-CHECK-NEXT: entry:
+// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 4 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv4i1(<vscale x 16 x i1> [[PG:%.*]])
+// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call { <vscale x 4 x i32>, <vscale x 4 x i32> } @llvm.aarch64.sve.ld2q.sret.nxv4i32(<vscale x 4 x i1> [[TMP0]], ptr [[BASE:%.*]])
+// CPP-CHECK-NEXT: [[TMP2:%.*]] = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32> } [[TMP1]], 0
+// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call <vscale x 8 x i32> @llvm.vector.insert.nxv8i32.nxv4i32(<vscale x 8 x i32> poison, <vscale x 4 x i32> [[TMP2]], i64 0)
+// CPP-CHECK-NEXT: [[TMP4:%.*]] = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32> } [[TMP1]], 1
+// CPP-CHECK-NEXT: [[TMP5:%.*]] = tail call <vscale x 8 x i32> @llvm.vector.insert.nxv8i32.nxv4i32(<vscale x 8 x i32> [[TMP3]], <vscale x 4 x i32> [[TMP4]], i64 4)
+// CPP-CHECK-NEXT: ret <vscale x 8 x i32> [[TMP5]]
+//
+svint32x2_t test_svld2q_s32(svbool_t pg, const int32_t *base)
+{
+ return SVE_ACLE_FUNC(svld2q,,_s32,)(pg, base);
+}
+
+// CHECK-LABEL: @test_svld2q_u64(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 2 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv2i1(<vscale x 16 x i1> [[PG:%.*]])
+// CHECK-NEXT: [[TMP1:%.*]] = tail call { <vscale x 2 x i64>, <vscale x 2 x i64> } @llvm.aarch64.sve.ld2q.sret.nxv2i64(<vscale x 2 x i1> [[TMP0]], ptr [[BASE:%.*]])
+// CHECK-NEXT: [[TMP2:%.*]] = extractvalue { <vscale x 2 x i64>, <vscale x 2 x i64> } [[TMP1]], 0
+// CHECK-NEXT: [[TMP3:%.*]] = tail call <vscale x 4 x i64> @llvm.vector.insert.nxv4i64.nxv2i64(<vscale x 4 x i64> poison, <vscale x 2 x i64> [[TMP2]], i64 0)
+// CHECK-NEXT: [[TMP4:%.*]] = extractvalue { <vscale x 2 x i64>, <vscale x 2 x i64> } [[TMP1]], 1
+// CHECK-NEXT: [[TMP5:%.*]] = tail call <vscale x 4 x i64> @llvm.vector.insert.nxv4i64.nxv2i64(<vscale x 4 x i64> [[TMP3]], <vscale x 2 x i64> [[TMP4]], i64 2)
+// CHECK-NEXT: ret <vscale x 4 x i64> [[TMP5]]
+//
+// CPP-CHECK-LABEL: @_Z15test_svld2q_u64u10__SVBool_tPKm(
+// CPP-CHECK-NEXT: entry:
+// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 2 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv2i1(<vscale x 16 x i1> [[PG:%.*]])
+// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call { <vscale x 2 x i64>, <vscale x 2 x i64> } @llvm.aarch64.sve.ld2q.sret.nxv2i64(<vscale x 2 x i1> [[TMP0]], ptr [[BASE:%.*]])
+// CPP-CHECK-NEXT: [[TMP2:%.*]] = extractvalue { <vscale x 2 x i64>, <vscale x 2 x i64> } [[TMP1]], 0
+// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call <vscale x 4 x i64> @llvm.vector.insert.nxv4i64.nxv2i64(<vscale x 4 x i64> poison, <vscale x 2 x i64> [[TMP2]], i64 0)
+// CPP-CHECK-NEXT: [[TMP4:%.*]] = extractvalue { <vscale x 2 x i64>, <vscale x 2 x i64> } [[TMP1]], 1
+// CPP-CHECK-NEXT: [[TMP5:%.*]] = tail call <vscale x 4 x i64> @llvm.vector.insert.nxv4i64.nxv2i64(<vscale x 4 x i64> [[TMP3]], <vscale x 2 x i64> [[TMP4]], i64 2)
+// CPP-CHECK-NEXT: ret <vscale x 4 x i64> [[TMP5]]
+//
+svuint64x2_t test_svld2q_u64(svbool_t pg, const uint64_t *base)
+{
+ return SVE_ACLE_FUNC(svld2q,,_u64,)(pg, base);
+}
+
+// CHECK-LABEL: @test_svld2q_s64(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 2 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv2i1(<vscale x 16 x i1> [[PG:%.*]])
+// CHECK-NEXT: [[TMP1:%.*]] = tail call { <vscale x 2 x i64>, <vscale x 2 x i64> } @llvm.aarch64.sve.ld2q.sret.nxv2i64(<vscale x 2 x i1> [[TMP0]], ptr [[BASE:%.*]])
+// CHECK-NEXT: [[TMP2:%.*]] = extractvalue { <vscale x 2 x i64>, <vscale x 2 x i64> } [[TMP1]], 0
+// CHECK-NEXT: [[TMP3:%.*]] = tail call <vscale x 4 x i64> @llvm.vector.insert.nxv4i64.nxv2i64(<vscale x 4 x i64> poison, <vscale x 2 x i64> [[TMP2]], i64 0)
+// CHECK-NEXT: [[TMP4:%.*]] = extractvalue { <vscale x 2 x i64>, <vscale x 2 x i64> } [[TMP1]], 1
+// CHECK-NEXT: [[TMP5:%.*]] = tail call <vscale x 4 x i64> @llvm.vector.insert.nxv4i64.nxv2i64(<vscale x 4 x i64> [[TMP3]], <vscale x 2 x i64> [[TMP4]], i64 2)
+// CHECK-NEXT: ret <vscale x 4 x i64> [[TMP5]]
+//
+// CPP-CHECK-LABEL: @_Z15test_svld2q_s64u10__SVBool_tPKl(
+// CPP-CHECK-NEXT: entry:
+// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 2 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv2i1(<vscale x 16 x i1> [[PG:%.*]])
+// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call { <vscale x 2 x i64>, <vscale x 2 x i64> } @llvm.aarch64.sve.ld2q.sret.nxv2i64(<vscale x 2 x i1> [[TMP0]], ptr [[BASE:%.*]])
+// CPP-CHECK-NEXT: [[TMP2:%.*]] = extractvalue { <vscale x 2 x i64>, <vscale x 2 x i64> } [[TMP1]], 0
+// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call <vscale x 4 x i64> @llvm.vector.insert.nxv4i64.nxv2i64(<vscale x 4 x i64> poison, <vscale x 2 x i64> [[TMP2]], i64 0)
+// CPP-CHECK-NEXT: [[TMP4:%.*]] = extractvalue { <vscale x 2 x i64>, <vscale x 2 x i64> } [[TMP1]], 1
+// CPP-CHECK-NEXT: [[TMP5:%.*]] = tail call <vscale x 4 x i64> @llvm.vector.insert.nxv4i64.nxv2i64(<vscale x 4 x i64> [[TMP3]], <vscale x 2 x i64> [[TMP4]], i64 2)
+// CPP-CHECK-NEXT: ret <vscale x 4 x i64> [[TMP5]]
+//
+svint64x2_t test_svld2q_s64(svbool_t pg, const int64_t *base)
+{
+ return SVE_ACLE_FUNC(svld2q,,_s64,)(pg, base);
+}
+
+// CHECK-LABEL: @test_svld2q_f16(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 8 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv8i1(<vscale x 16 x i1> [[PG:%.*]])
+// CHECK-NEXT: [[TMP1:%.*]] = tail call { <vscale x 8 x half>, <vscale x 8 x half> } @llvm.aarch64.sve.ld2q.sret.nxv8f16(<vscale x 8 x i1> [[TMP0]], ptr [[BASE:%.*]])
+// CHECK-NEXT: [[TMP2:%.*]] = extractvalue { <vscale x 8 x half>, <vscale x 8 x half> } [[TMP1]], 0
+// CHECK-NEXT: [[TMP3:%.*]] = tail call <vscale x 16 x half> @llvm.vector.insert.nxv16f16.nxv8f16(<vscale x 16 x half> poison, <vscale x 8 x half> [[TMP2]], i64 0)
+// CHECK-NEXT: [[TMP4:%.*]] = extractvalue { <vscale x 8 x half>, <vscale x 8 x half> } [[TMP1]], 1
+// CHECK-NEXT: [[TMP5:%.*]] = tail call <vscale x 16 x half> @llvm.vector.insert.nxv16f16.nxv8f16(<vscale x 16 x half> [[TMP3]], <vscale x 8 x half> [[TMP4]], i64 8)
+// CHECK-NEXT: ret <vscale x 16 x half> [[TMP5]]
+//
+// CPP-CHECK-LABEL: @_Z15test_svld2q_f16u10__SVBool_tPKDh(
+// CPP-CHECK-NEXT: entry:
+// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 8 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv8i1(<vscale x 16 x i1> [[PG:%.*]])
+// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call { <vscale x 8 x half>, <vscale x 8 x half> } @llvm.aarch64.sve.ld2q.sret.nxv8f16(<vscale x 8 x i1> [[TMP0]], ptr [[BASE:%.*]])
+// CPP-CHECK-NEXT: [[TMP2:%.*]] = extractvalue { <vscale x 8 x half>, <vscale x 8 x half> } [[TMP1]], 0
+// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call <vscale x 16 x half> @llvm.vector.insert.nxv16f16.nxv8f16(<vscale x 16 x half> poison, <vscale x 8 x half> [[TMP2]], i64 0)
+// CPP-CHECK-NEXT: [[TMP4:%.*]] = extractvalue { <vscale x 8 x half>, <vscale x 8 x half> } [[TMP1]], 1
+// CPP-CHECK-NEXT: [[TMP5:%.*]] = tail call <vscale x 16 x half> @llvm.vector.insert.nxv16f16.nxv8f16(<vscale x 16 x half> [[TMP3]], <vscale x 8 x half> [[TMP4]], i64 8)
+// CPP-CHECK-NEXT: ret <vscale x 16 x half> [[TMP5]]
+//
+svfloat16x2_t test_svld2q_f16(svbool_t pg, const float16_t *base)
+{
+ return SVE_ACLE_FUNC(svld2q,,_f16,)(pg, base);
+}
+
+// CHECK-LABEL: @test_svld2q_bf16(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 8 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv8i1(<vscale x 16 x i1> [[PG:%.*]])
+// CHECK-NEXT: [[TMP1:%.*]] = tail call { <vscale x 8 x bfloat>, <vscale x 8 x bfloat> } @llvm.aarch64.sve.ld2q.sret.nxv8bf16(<vscale x 8 x i1> [[TMP0]], ptr [[BASE:%.*]])
+// CHECK-NEXT: [[TMP2:%.*]] = extractvalue { <vscale x 8 x bfloat>, <vscale x 8 x bfloat> } [[TMP1]], 0
+// CHECK-NEXT: [[TMP3:%.*]] = tail call <vscale x 16 x bfloat> @llvm.vector.insert.nxv16bf16.nxv8bf16(<vscale x 16 x bfloat> poison, <vscale x 8 x bfloat> [[TMP2]], i64 0)
+// CHECK-NEXT: [[TMP4:%.*]] = extractvalue { <vscale x 8 x bfloat>, <vscale x 8 x bfloat> } [[TMP1]], 1
+// CHECK-NEXT: [[TMP5:%.*]] = tail call <vscale x 16 x bfloat> @llvm.vector.insert.nxv16bf16.nxv8bf16(<vscale x 16 x bfloat> [[TMP3]], <vscale x 8 x bfloat> [[TMP4]], i64 8)
+// CHECK-NEXT: ret <vscale x 16 x bfloat> [[TMP5]]
+//
+// CPP-CHECK-LABEL: @_Z16test_svld2q_bf16u10__SVBool_tPKu6__bf16(
+// CPP-CHECK-NEXT: entry:
+// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 8 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv8i1(<vscale x 16 x i1> [[PG:%.*]])
+// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call { <vscale x 8 x bfloat>, <vscale x 8 x bfloat> } @llvm.aarch64.sve.ld2q.sret.nxv8bf16(<vscale x 8 x i1> [[TMP0]], ptr [[BASE:%.*]])
+// CPP-CHECK-NEXT: [[TMP2:%.*]] = extractvalue { <vscale x 8 x bfloat>, <vscale x 8 x bfloat> } [[TMP1]], 0
+// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call <vscale x 16 x bfloat> @llvm.vector.insert.nxv16bf16.nxv8bf16(<vscale x 16 x bfloat> poison, <vscale x 8 x bfloat> [[TMP2]], i64 0)
+// CPP-CHECK-NEXT: [[TMP4:%.*]] = extractvalue { <vscale x 8 x bfloat>, <vscale x 8 x bfloat> } [[TMP1]], 1
+// CPP-CHECK-NEXT: [[TMP5:%.*]] = tail call <vscale x 16 x bfloat> @llvm.vector.insert.nxv16bf16.nxv8bf16(<vscale x 16 x bfloat> [[TMP3]], <vscale x 8 x bfloat> [[TMP4]], i64 8)
+// CPP-CHECK-NEXT: ret <vscale x 16 x bfloat> [[TMP5]]
+//
+svbfloat16x2_t test_svld2q_bf16(svbool_t pg, const bfloat16_t *base)
+{
+ return SVE_ACLE_FUNC(svld2q,,_bf16,)(pg, base);
+}
+
+// CHECK-LABEL: @test_svld2q_f32(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 4 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv4i1(<vscale x 16 x i1> [[PG:%.*]])
+// CHECK-NEXT: [[TMP1:%.*]] = tail call { <vscale x 4 x float>, <vscale x 4 x float> } @llvm.aarch64.sve.ld2q.sret.nxv4f32(<vscale x 4 x i1> [[TMP0]], ptr [[BASE:%.*]])
+// CHECK-NEXT: [[TMP2:%.*]] = extractvalue { <vscale x 4 x float>, <vscale x 4 x float> } [[TMP1]], 0
+// CHECK-NEXT: [[TMP3:%.*]] = tail call <vscale x 8 x float> @llvm.vector.insert.nxv8f32.nxv4f32(<vscale x 8 x float> poison, <vscale x 4 x float> [[TMP2]], i64 0)
+// CHECK-NEXT: [[TMP4:%.*]] = extractvalue { <vscale x 4 x float>, <vscale x 4 x float> } [[TMP1]], 1
+// CHECK-NEXT: [[TMP5:%.*]] = tail call <vscale x 8 x float> @llvm.vector.insert.nxv8f32.nxv4f32(<vscale x 8 x float> [[TMP3]], <vscale x 4 x float> [[TMP4]], i64 4)
+// CHECK-NEXT: ret <vscale x 8 x float> [[TMP5]]
+//
+// CPP-CHECK-LABEL: @_Z15test_svld2q_f32u10__SVBool_tPKf(
+// CPP-CHECK-NEXT: entry:
+// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 4 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv4i1(<vscale x 16 x i1> [[PG:%.*]])
+// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call { <vscale x 4 x float>, <vscale x 4 x float> } @llvm.aarch64.sve.ld2q.sret.nxv4f32(<vscale x 4 x i1> [[TMP0]], ptr [[BASE:%.*]])
+// CPP-CHECK-NEXT: [[TMP2:%.*]] = extractvalue { <vscale x 4 x float>, <vscale x 4 x float> } [[TMP1]], 0
+// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call <vscale x 8 x float> @llvm.vector.insert.nxv8f32.nxv4f32(<vscale x 8 x float> poison, <vscale x 4 x float> [[TMP2]], i64 0)
+// CPP-CHECK-NEXT: [[TMP4:%.*]] = extractvalue { <vscale x 4 x float>, <vscale x 4 x float> } [[TMP1]], 1
+// CPP-CHECK-NEXT: [[TMP5:%.*]] = tail call <vscale x 8 x float> @llvm.vector.insert.nxv8f32.nxv4f32(<vscale x 8 x float> [[TMP3]], <vscale x 4 x float> [[TMP4]], i64 4)
+// CPP-CHECK-NEXT: ret <vscale x 8 x float> [[TMP5]]
+//
+svfloat32x2_t test_svld2q_f32(svbool_t pg, const float32_t *base)
+{
+ return SVE_ACLE_FUNC(svld2q,,_f32,)(pg, base);
+}
+
+// CHECK-LABEL: @test_svld2q_f64(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 2 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv2i1(<vscale x 16 x i1> [[PG:%.*]])
+// CHECK-NEXT: [[TMP1:%.*]] = tail call { <vscale x 2 x double>, <vscale x 2 x double> } @llvm.aarch64.sve.ld2q.sret.nxv2f64(<vscale x 2 x i1> [[TMP0]], ptr [[BASE:%.*]])
+// CHECK-NEXT: [[TMP2:%.*]] = extractvalue { <vscale x 2 x double>, <vscale x 2 x double> } [[TMP1]], 0
+// CHECK-NEXT: [[TMP3:%.*]] = tail call <vscale x 4 x double> @llvm.vector.insert.nxv4f64.nxv2f64(<vscale x 4 x double> poison, <vscale x 2 x double> [[TMP2]], i64 0)
+// CHECK-NEXT: [[TMP4:%.*]] = extractvalue { <vscale x 2 x double>, <vscale x 2 x double> } [[TMP1]], 1
+// CHECK-NEXT: [[TMP5:%.*]] = tail call <vscale x 4 x double> @llvm.vector.insert.nxv4f64.nxv2f64(<vscale x 4 x double> [[TMP3]], <vscale x 2 x double> [[TMP4]], i64 2)
+// CHECK-NEXT: ret <vscale x 4 x double> [[TMP5]]
+//
+// CPP-CHECK-LABEL: @_Z15test_svld2q_f64u10__SVBool_tPKd(
+// CPP-CHECK-NEXT: entry:
+// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 2 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv2i1(<vscale x 16 x i1> [[PG:%.*]])
+// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call { <vscale x 2 x double>, <vscale x 2 x double> } @llvm.aarch64.sve.ld2q.sret.nxv2f64(<vscale x 2 x i1> [[TMP0]], ptr [[BASE:%.*]])
+// CPP-CHECK-NEXT: [[TMP2:%.*]] = extractvalue { <vscale x 2 x double>, <vscale x 2 x double> } [[TMP1]], 0
+// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call <vscale x 4 x double> @llvm.vector.insert.nxv4f64.nxv2f64(<vscale x 4 x double> poison, <vscale x 2 x double> [[TMP2]], i64 0)
+// CPP-CHECK-NEXT: [[TMP4:%.*]] = extractvalue { <vscale x 2 x double>, <vscale x 2 x double> } [[TMP1]], 1
+// CPP-CHECK-NEXT: [[TMP5:%.*]] = tail call <vscale x 4 x double> @llvm.vector.insert.nxv4f64.nxv2f64(<vscale x 4 x double> [[TMP3]], <vscale x 2 x double> [[TMP4]], i64 2)
+// CPP-CHECK-NEXT: ret <vscale x 4 x double> [[TMP5]]
+//
+svfloat64x2_t test_svld2q_f64(svbool_t pg, const float64_t *base)
+{
+ return SVE_ACLE_FUNC(svld2q,,_f64,)(pg, base);
+}
+
+// CHECK-LABEL: @test_svld2q_vnum_u8(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[TMP0:%.*]] = getelementptr <vscale x 16 x i8>, ptr [[BASE:%.*]], i64 [[VNUM:%.*]]
+// CHECK-NEXT: [[TMP1:%.*]] = tail call { <vscale x 16 x i8>, <vscale x 16 x i8> } @llvm.aarch64.sve.ld2q.sret.nxv16i8(<vscale x 16 x i1> [[PG:%.*]], ptr [[TMP0]])
+// CHECK-NEXT: [[TMP2:%.*]] = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8> } [[TMP1]], 0
+// CHECK-NEXT: [[TMP3:%.*]] = tail call <vscale x 32 x i8> @llvm.vector.insert.nxv32i8.nxv16i8(<vscale x 32 x i8> poison, <vscale x 16 x i8> [[TMP2]], i64 0)
+// CHECK-NEXT: [[TMP4:%.*]] = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8> } [[TMP1]], 1
+// CHECK-NEXT: [[TMP5:%.*]] = tail call <vscale x 32 x i8> @llvm.vector.insert.nxv32i8.nxv16i8(<vscale x 32 x i8> [[TMP3]], <vscale x 16 x i8> [[TMP4]], i64 16)
+// CHECK-NEXT: ret <vscale x 32 x i8> [[TMP5]]
+//
+// CPP-CHECK-LABEL: @_Z19test_svld2q_vnum_u8u10__SVBool_tPKhl(
+// CPP-CHECK-NEXT: entry:
+// CPP-CHECK-NEXT: [[TMP0:%.*]] = getelementptr <vscale x 16 x i8>, ptr [[BASE:%.*]], i64 [[VNUM:%.*]]
+// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call { <vscale x 16 x i8>, <vscale x 16 x i8> } @llvm.aarch64.sve.ld2q.sret.nxv16i8(<vscale x 16 x i1> [[PG:%.*]], ptr [[TMP0]])
+// CPP-CHECK-NEXT: [[TMP2:%.*]] = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8> } [[TMP1]], 0
+// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call <vscale x 32 x i8> @llvm.vector.insert.nxv32i8.nxv16i8(<vscale x 32 x i8> poison, <vscale x 16 x i8> [[TMP2]], i64 0)
+// CPP-CHECK-NEXT: [[TMP4:%.*]] = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8> } [[TMP1]], 1
+// CPP-CHECK-NEXT: [[TMP5:%.*]] = tail call <vscale x 32 x i8> @llvm.vector.insert.nxv32i8.nxv16i8(<vscale x 32 x i8> [[TMP3]], <vscale x 16 x i8> [[TMP4]], i64 16)
+// CPP-CHECK-NEXT: ret <vscale x 32 x i8> [[TMP5]]
+//
+svuint8x2_t test_svld2q_vnum_u8(svbool_t pg, const uint8_t *base, int64_t vnum)
+{
+ return SVE_ACLE_FUNC(svld2q_vnum_,,u8,)(pg, base, vnum);
+}
+
+// CHECK-LABEL: @test_svld2q_vnum_s8(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[TMP0:%.*]] = getelementptr <vscale x 16 x i8>, ptr [[BASE:%.*]], i64 [[VNUM:%.*]]
+// CHECK-NEXT: [[TMP1:%.*]] = tail call { <vscale x 16 x i8>, <vscale x 16 x i8> } @llvm.aarch64.sve.ld2q.sret.nxv16i8(<vscale x 16 x i1> [[PG:%.*]], ptr [[TMP0]])
+// CHECK-NEXT: [[TMP2:%.*]] = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8> } [[TMP1]], 0
+// CHECK-NEXT: [[TMP3:%.*]] = tail call <vscale x 32 x i8> @llvm.vector.insert.nxv32i8.nxv16i8(<vscale x 32 x i8> poison, <vscale x 16 x i8> [[TMP2]], i64 0)
+// CHECK-NEXT: [[TMP4:%.*]] = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8> } [[TMP1]], 1
+// CHECK-NEXT: [[TMP5:%.*]] = tail call <vscale x 32 x i8> @llvm.vector.insert.nxv32i8.nxv16i8(<vscale x 32 x i8> [[TMP3]], <vscale x 16 x i8> [[TMP4]], i64 16)
+// CHECK-NEXT: ret <vscale x 32 x i8> [[TMP5]]
+//
+// CPP-CHECK-LABEL: @_Z19test_svld2q_vnum_s8u10__SVBool_tPKal(
+// CPP-CHECK-NEXT: entry:
+// CPP-CHECK-NEXT: [[TMP0:%.*]] = getelementptr <vscale x 16 x i8>, ptr [[BASE:%.*]], i64 [[VNUM:%.*]]
+// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call { <vscale x 16 x i8>, <vscale x 16 x i8> } @llvm.aarch64.sve.ld2q.sret.nxv16i8(<vscale x 16 x i1> [[PG:%.*]], ptr [[TMP0]])
+// CPP-CHECK-NEXT: [[TMP2:%.*]] = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8> } [[TMP1]], 0
+// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call <vscale x 32 x i8> @llvm.vector.insert.nxv32i8.nxv16i8(<vscale x 32 x i8> poison, <vscale x 16 x i8> [[TMP2]], i64 0)
+// CPP-CHECK-NEXT: [[TMP4:%.*]] = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8> } [[TMP1]], 1
+// CPP-CHECK-NEXT: [[TMP5:%.*]] = tail call <vscale x 32 x i8> @llvm.vector.insert.nxv32i8.nxv16i8(<vscale x 32 x i8> [[TMP3]], <vscale x 16 x i8> [[TMP4]], i64 16)
+// CPP-CHECK-NEXT: ret <vscale x 32 x i8> [[TMP5]]
+//
+svint8x2_t test_svld2q_vnum_s8(svbool_t pg, const int8_t *base, int64_t vnum)
+{
+ return SVE_ACLE_FUNC(svld2q_vnum,,_s8,)(pg, base, vnum);
+}
+// CHECK-LABEL: @test_svld2q_vnum_u16(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 8 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv8i1(<vscale x 16 x i1> [[PG:%.*]])
+// CHECK-NEXT: [[TMP1:%.*]] = getelementptr <vscale x 8 x i16>, ptr [[BASE:%.*]], i64 [[VNUM:%.*]]
+// CHECK-NEXT: [[TMP2:%.*]] = tail call { <vscale x 8 x i16>, <vscale x 8 x i16> } @llvm.aarch64.sve.ld2q.sret.nxv8i16(<vscale x 8 x i1> [[TMP0]], ptr [[TMP1]])
+// CHECK-NEXT: [[TMP3:%.*]] = extractvalue { <vscale x 8 x i16>, <vscale x 8 x i16> } [[TMP2]], 0
+// CHECK-NEXT: [[TMP4:%.*]] = tail call <vscale x 16 x i16> @llvm.vector.insert.nxv16i16.nxv8i16(<vscale x 16 x i16> poison, <vscale x 8 x i16> [[TMP3]], i64 0)
+// CHECK-NEXT: [[TMP5:%.*]] = extractvalue { <vscale x 8 x i16>, <vscale x 8 x i16> } [[TMP2]], 1
+// CHECK-NEXT: [[TMP6:%.*]] = tail call <vscale x 16 x i16> @llvm.vector.insert.nxv16i16.nxv8i16(<vscale x 16 x i16> [[TMP4]], <vscale x 8 x i16> [[TMP5]], i64 8)
+// CHECK-NEXT: ret <vscale x 16 x i16> [[TMP6]]
+//
+// CPP-CHECK-LABEL: @_Z20test_svld2q_vnum_u16u10__SVBool_tPKtl(
+// CPP-CHECK-NEXT: entry:
+// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 8 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv8i1(<vscale x 16 x i1> [[PG:%.*]])
+// CPP-CHECK-NEXT: [[TMP1:%.*]] = getelementptr <vscale x 8 x i16>, ptr [[BASE:%.*]], i64 [[VNUM:%.*]]
+// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call { <vscale x 8 x i16>, <vscale x 8 x i16> } @llvm.aarch64.sve.ld2q.sret.nxv8i16(<vscale x 8 x i1> [[TMP0]], ptr [[TMP1]])
+// CPP-CHECK-NEXT: [[TMP3:%.*]] = extractvalue { <vscale x 8 x i16>, <vscale x 8 x i16> } [[TMP2]], 0
+// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call <vscale x 16 x i16> @llvm.vector.insert.nxv16i16.nxv8i16(<vscale x 16 x i16> poison, <vscale x 8 x i16> [[TMP3]], i64 0)
+// CPP-CHECK-NEXT: [[TMP5:%.*]] = extractvalue { <vscale x 8 x i16>, <vscale x 8 x i16> } [[TMP2]], 1
+// CPP-CHECK-NEXT: [[TMP6:%.*]] = tail call <vscale x 16 x i16> @llvm.vector.insert.nxv16i16.nxv8i16(<vscale x 16 x i16> [[TMP4]], <vscale x 8 x i16> [[TMP5]], i64 8)
+// CPP-CHECK-NEXT: ret <vscale x 16 x i16> [[TMP6]]
+//
+svuint16x2_t test_svld2q_vnum_u16(svbool_t pg, const uint16_t *base, int64_t vnum)
+{
+ return SVE_ACLE_FUNC(svld2q_vnum,,_u16,)(pg, base, vnum);
+}
+
+// CHECK-LABEL: @test_svld2q_vnum_s16(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 8 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv8i1(<vscale x 16 x i1> [[PG:%.*]])
+// CHECK-NEXT: [[TMP1:%.*]] = getelementptr <vscale x 8 x i16>, ptr [[BASE:%.*]], i64 [[VNUM:%.*]]
+// CHECK-NEXT: [[TMP2:%.*]] = tail call { <vscale x 8 x i16>, <vscale x 8 x i16> } @llvm.aarch64.sve.ld2q.sret.nxv8i16(<vscale x 8 x i1> [[TMP0]], ptr [[TMP1]])
+// CHECK-NEXT: [[TMP3:%.*]] = extractvalue { <vscale x 8 x i16>, <vscale x 8 x i16> } [[TMP2]], 0
+// CHECK-NEXT: [[TMP4:%.*]] = tail call <vscale x 16 x i16> @llvm.vector.insert.nxv16i16.nxv8i16(<vscale x 16 x i16> poison, <vscale x 8 x i16> [[TMP3]], i64 0)
+// CHECK-NEXT: [[TMP5:%.*]] = extractvalue { <vscale x 8 x i16>, <vscale x 8 x i16> } [[TMP2]], 1
+// CHECK-NEXT: [[TMP6:%.*]] = tail call <vscale x 16 x i16> @llvm.vector.insert.nxv16i16.nxv8i16(<vscale x 16 x i16> [[TMP4]], <vscale x 8 x i16> [[TMP5]], i64 8)
+// CHECK-NEXT: ret <vscale x 16 x i16> [[TMP6]]
+//
+// CPP-CHECK-LABEL: @_Z20test_svld2q_vnum_s16u10__SVBool_tPKsl(
+// CPP-CHECK-NEXT: entry:
+// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 8 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv8i1(<vscale x 16 x i1> [[PG:%.*]])
+// CPP-CHECK-NEXT: [[TMP1:%.*]] = getelementptr <vscale x 8 x i16>, ptr [[BASE:%.*]], i64 [[VNUM:%.*]]
+// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call { <vscale x 8 x i16>, <vscale x 8 x i16> } @llvm.aarch64.sve.ld2q.sret.nxv8i16(<vscale x 8 x i1> [[TMP0]], ptr [[TMP1]])
+// CPP-CHECK-NEXT: [[TMP3:%.*]] = extractvalue { <vscale x 8 x i16>, <vscale x 8 x i16> } [[TMP2]], 0
+// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call <vscale x 16 x i16> @llvm.vector.insert.nxv16i16.nxv8i16(<vscale x 16 x i16> poison, <vscale x 8 x i16> [[TMP3]], i64 0)
+// CPP-CHECK-NEXT: [[TMP5:%.*]] = extractvalue { <vscale x 8 x i16>, <vscale x 8 x i16> } [[TMP2]], 1
+// CPP-CHECK-NEXT: [[TMP6:%.*]] = tail call <vscale x 16 x i16> @llvm.vector.insert.nxv16i16.nxv8i16(<vscale x 16 x i16> [[TMP4]], <vscale x 8 x i16> [[TMP5]], i64 8)
+// CPP-CHECK-NEXT: ret <vscale x 16 x i16> [[TMP6]]
+//
+svint16x2_t test_svld2q_vnum_s16(svbool_t pg, const int16_t *base, int64_t vnum)
+{
+ return SVE_ACLE_FUNC(svld2q_vnum,,_s16,)(pg, base, vnum);
+}
+
+// CHECK-LABEL: @test_svld2q_vnum_u32(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 4 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv4i1(<vscale x 16 x i1> [[PG:%.*]])
+// CHECK-NEXT: [[TMP1:%.*]] = getelementptr <vscale x 4 x i32>, ptr [[BASE:%.*]], i64 [[VNUM:%.*]]
+// CHECK-NEXT: [[TMP2:%.*]] = tail call { <vscale x 4 x i32>, <vscale x 4 x i32> } @llvm.aarch64.sve.ld2q.sret.nxv4i32(<vscale x 4 x i1> [[TMP0]], ptr [[TMP1]])
+// CHECK-NEXT: [[TMP3:%.*]] = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32> } [[TMP2]], 0
+// CHECK-NEXT: [[TMP4:%.*]] = tail call <vscale x 8 x i32> @llvm.vector.insert.nxv8i32.nxv4i32(<vscale x 8 x i32> poison, <vscale x 4 x i32> [[TMP3]], i64 0)
+// CHECK-NEXT: [[TMP5:%.*]] = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32> } [[TMP2]], 1
+// CHECK-NEXT: [[TMP6:%.*]] = tail call <vscale x 8 x i32> @llvm.vector.insert.nxv8i32.nxv4i32(<vscale x 8 x i32> [[TMP4]], <vscale x 4 x i32> [[TMP5]], i64 4)
+// CHECK-NEXT: ret <vscale x 8 x i32> [[TMP6]]
+//
+// CPP-CHECK-LABEL: @_Z20test_svld2q_vnum_u32u10__SVBool_tPKjl(
+// CPP-CHECK-NEXT: entry:
+// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 4 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv4i1(<vscale x 16 x i1> [[PG:%.*]])
+// CPP-CHECK-NEXT: [[TMP1:%.*]] = getelementptr <vscale x 4 x i32>, ptr [[BASE:%.*]], i64 [[VNUM:%.*]]
+// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call { <vscale x 4 x i32>, <vscale x 4 x i32> } @llvm.aarch64.sve.ld2q.sret.nxv4i32(<vscale x 4 x i1> [[TMP0]], ptr [[TMP1]])
+// CPP-CHECK-NEXT: [[TMP3:%.*]] = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32> } [[TMP2]], 0
+// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call <vscale x 8 x i32> @llvm.vector.insert.nxv8i32.nxv4i32(<vscale x 8 x i32> poison, <vscale x 4 x i32> [[TMP3]], i64 0)
+// CPP-CHECK-NEXT: [[TMP5:%.*]] = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32> } [[TMP2]], 1
+// CPP-CHECK-NEXT: [[TMP6:%.*]] = tail call <vscale x 8 x i32> @llvm.vector.insert.nxv8i32.nxv4i32(<vscale x 8 x i32> [[TMP4]], <vscale x 4 x i32> [[TMP5]], i64 4)
+// CPP-CHECK-NEXT: ret <vscale x 8 x i32> [[TMP6]]
+//
+svuint32x2_t test_svld2q_vnum_u32(svbool_t pg, const uint32_t *base, int64_t vnum)
+{
+ return SVE_ACLE_FUNC(svld2q_vnum,,_u32,)(pg, base, vnum);
+}
+
+// CHECK-LABEL: @test_svld2q_vnum_s32(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 4 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv4i1(<vscale x 16 x i1> [[PG:%.*]])
+// CHECK-NEXT: [[TMP1:%.*]] = getelementptr <vscale x 4 x i32>, ptr [[BASE:%.*]], i64 [[VNUM:%.*]]
+// CHECK-NEXT: [[TMP2:%.*]] = tail call { <vscale x 4 x i32>, <vscale x 4 x i32> } @llvm.aarch64.sve.ld2q.sret.nxv4i32(<vscale x 4 x i1> [[TMP0]], ptr [[TMP1]])
+// CHECK-NEXT: [[TMP3:%.*]] = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32> } [[TMP2]], 0
+// CHECK-NEXT: [[TMP4:%.*]] = tail call <vscale x 8 x i32> @llvm.vector.insert.nxv8i32.nxv4i32(<vscale x 8 x i32> poison, <vscale x 4 x i32> [[TMP3]], i64 0)
+// CHECK-NEXT: [[TMP5:%.*]] = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32> } [[TMP2]], 1
+// CHECK-NEXT: [[TMP6:%.*]] = tail call <vscale x 8 x i32> @llvm.vector.insert.nxv8i32.nxv4i32(<vscale x 8 x i32> [[TMP4]], <vscale x 4 x i32> [[TMP5]], i64 4)
+// CHECK-NEXT: ret <vscale x 8 x i32> [[TMP6]]
+//
+// CPP-CHECK-LABEL: @_Z20test_svld2q_vnum_s32u10__SVBool_tPKil(
+// CPP-CHECK-NEXT: entry:
+// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 4 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv4i1(<vscale x 16 x i1> [[PG:%.*]])
+// CPP-CHECK-NEXT: [[TMP1:%.*]] = getelementptr <vscale x 4 x i32>, ptr [[BASE:%.*]], i64 [[VNUM:%.*]]
+// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call { <vscale x 4 x i32>, <vscale x 4 x i32> } @llvm.aarch64.sve.ld2q.sret.nxv4i32(<vscale x 4 x i1> [[TMP0]], ptr [[TMP1]])
+// CPP-CHECK-NEXT: [[TMP3:%.*]] = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32> } [[TMP2]], 0
+// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call <vscale x 8 x i32> @llvm.vector.insert.nxv8i32.nxv4i32(<vscale x 8 x i32> poison, <vscale x 4 x i32> [[TMP3]], i64 0)
+// CPP-CHECK-NEXT: [[TMP5:%.*]] = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32> } [[TMP2]], 1
+// CPP-CHECK-NEXT: [[TMP6:%.*]] = tail call <vscale x 8 x i32> @llvm.vector.insert.nxv8i32.nxv4i32(<vscale x 8 x i32> [[TMP4]], <vscale x 4 x i32> [[TMP5]], i64 4)
+// CPP-CHECK-NEXT: ret <vscale x 8 x i32> [[TMP6]]
+//
+svint32x2_t test_svld2q_vnum_s32(svbool_t pg, const int32_t *base, int64_t vnum)
+{
+ return SVE_ACLE_FUNC(svld2q_vnum,,_s32,)(pg, base, vnum);
+}
+
+// CHECK-LABEL: @test_svld2q_vnum_u64(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 2 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv2i1(<vscale x 16 x i1> [[PG:%.*]])
+// CHECK-NEXT: [[TMP1:%.*]] = getelementptr <vscale x 2 x i64>, ptr [[BASE:%.*]], i64 [[VNUM:%.*]]
+// CHECK-NEXT: [[TMP2:%.*]] = tail call { <vscale x 2 x i64>, <vscale x 2 x i64> } @llvm.aarch64.sve.ld2q.sret.nxv2i64(<vscale x 2 x i1> [[TMP0]], ptr [[TMP1]])
+// CHECK-NEXT: [[TMP3:%.*]] = extractvalue { <vscale x 2 x i64>, <vscale x 2 x i64> } [[TMP2]], 0
+// CHECK-NEXT: [[TMP4:%.*]] = tail call <vscale x 4 x i64> @llvm.vector.insert.nxv4i64.nxv2i64(<vscale x 4 x i64> poison, <vscale x 2 x i64> [[TMP3]], i64 0)
+// CHECK-NEXT: [[TMP5:%.*]] = extractvalue { <vscale x 2 x i64>, <vscale x 2 x i64> } [[TMP2]], 1
+// CHECK-NEXT: [[TMP6:%.*]] = tail call <vscale x 4 x i64> @llvm.vector.insert.nxv4i64.nxv2i64(<vscale x 4 x i64> [[TMP4]], <vscale x 2 x i64> [[TMP5]], i64 2)
+// CHECK-NEXT: ret <vscale x 4 x i64> [[TMP6]]
+//
+// CPP-CHECK-LABEL: @_Z20test_svld2q_vnum_u64u10__SVBool_tPKml(
+// CPP-CHECK-NEXT: entry:
+// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 2 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv2i1(<vscale x 16 x i1> [[PG:%.*]])
+// CPP-CHECK-NEXT: [[TMP1:%.*]] = getelementptr <vscale x 2 x i64>, ptr [[BASE:%.*]], i64 [[VNUM:%.*]]
+// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call { <vscale x 2 x i64>, <vscale x 2 x i64> } @llvm.aarch64.sve.ld2q.sret.nxv2i64(<vscale x 2 x i1> [[TMP0]], ptr [[TMP1]])
+// CPP-CHECK-NEXT: [[TMP3:%.*]] = extractvalue { <vscale x 2 x i64>, <vscale x 2 x i64> } [[TMP2]], 0
+// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call <vscale x 4 x i64> @llvm.vector.insert.nxv4i64.nxv2i64(<vscale x 4 x i64> poison, <vscale x 2 x i64> [[TMP3]], i64 0)
+// CPP-CHECK-NEXT: [[TMP5:%.*]] = extractvalue { <vscale x 2 x i64>, <vscale x 2 x i64> } [[TMP2]], 1
+// CPP-CHECK-NEXT: [[TMP6:%.*]] = tail call <vscale x 4 x i64> @llvm.vector.insert.nxv4i64.nxv2i64(<vscale x 4 x i64> [[TMP4]], <vscale x 2 x i64> [[TMP5]], i64 2)
+// CPP-CHECK-NEXT: ret <vscale x 4 x i64> [[TMP6]]
+//
+svuint64x2_t test_svld2q_vnum_u64(svbool_t pg, const uint64_t *base, int64_t vnum)
+{
+ return SVE_ACLE_FUNC(svld2q_vnum,,_u64,)(pg, base, vnum);
+}
+
+// CHECK-LABEL: @test_svld2q_vnum_s64(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 2 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv2i1(<vscale x 16 x i1> [[PG:%.*]])
+// CHECK-NEXT: [[TMP1:%.*]] = getelementptr <vscale x 2 x i64>, ptr [[BASE:%.*]], i64 [[VNUM:%.*]]
+// CHECK-NEXT: [[TMP2:%.*]] = tail call { <vscale x 2 x i64>, <vscale x 2 x i64> } @llvm.aarch64.sve.ld2q.sret.nxv2i64(<vscale x 2 x i1> [[TMP0]], ptr [[TMP1]])
+// CHECK-NEXT: [[TMP3:%.*]] = extractvalue { <vscale x 2 x i64>, <vscale x 2 x i64> } [[TMP2]], 0
+// CHECK-NEXT: [[TMP4:%.*]] = tail call <vscale x 4 x i64> @llvm.vector.insert.nxv4i64.nxv2i64(<vscale x 4 x i64> poison, <vscale x 2 x i64> [[TMP3]], i64 0)
+// CHECK-NEXT: [[TMP5:%.*]] = extractvalue { <vscale x 2 x i64>, <vscale x 2 x i64> } [[TMP2]], 1
+// CHECK-NEXT: [[TMP6:%.*]] = tail call <vscale x 4 x i64> @llvm.vector.insert.nxv4i64.nxv2i64(<vscale x 4 x i64> [[TMP4]], <vscale x 2 x i64> [[TMP5]], i64 2)
+// CHECK-NEXT: ret <vscale x 4 x i64> [[TMP6]]
+//
+// CPP-CHECK-LABEL: @_Z20test_svld2q_vnum_s64u10__SVBool_tPKll(
+// CPP-CHECK-NEXT: entry:
+// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 2 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv2i1(<vscale x 16 x i1> [[PG:%.*]])
+// CPP-CHECK-NEXT: [[TMP1:%.*]] = getelementptr <vscale x 2 x i64>, ptr [[BASE:%.*]], i64 [[VNUM:%.*]]
+// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call { <vscale x 2 x i64>, <vscale x 2 x i64> } @llvm.aarch64.sve.ld2q.sret.nxv2i64(<vscale x 2 x i1> [[TMP0]], ptr [[TMP1]])
+// CPP-CHECK-NEXT: [[TMP3:%.*]] = extractvalue { <vscale x 2 x i64>, <vscale x 2 x i64> } [[TMP2]], 0
+// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call <vscale x 4 x i64> @llvm.vector.insert.nxv4i64.nxv2i64(<vscale x 4 x i64> poison, <vscale x 2 x i64> [[TMP3]], i64 0)
+// CPP-CHECK-NEXT: [[TMP5:%.*]] = extractvalue { <vscale x 2 x i64>, <vscale x 2 x i64> } [[TMP2]], 1
+// CPP-CHECK-NEXT: [[TMP6:%.*]] = tail call <vscale x 4 x i64> @llvm.vector.insert.nxv4i64.nxv2i64(<vscale x 4 x i64> [[TMP4]], <vscale x 2 x i64> [[TMP5]], i64 2)
+// CPP-CHECK-NEXT: ret <vscale x 4 x i64> [[TMP6]]
+//
+svint64x2_t test_svld2q_vnum_s64(svbool_t pg, const int64_t *base, int64_t vnum)
+{
+ return SVE_ACLE_FUNC(svld2q_vnum,,_s64,)(pg, base, vnum);
+}
+
+// CHECK-LABEL: @test_svld2q_vnum_f16(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 8 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv8i1(<vscale x 16 x i1> [[PG:%.*]])
+// CHECK-NEXT: [[TMP1:%.*]] = getelementptr <vscale x 8 x half>, ptr [[BASE:%.*]], i64 [[VNUM:%.*]]
+// CHECK-NEXT: [[TMP2:%.*]] = tail call { <vscale x 8 x half>, <vscale x 8 x half> } @llvm.aarch64.sve.ld2q.sret.nxv8f16(<vscale x 8 x i1> [[TMP0]], ptr [[TMP1]])
+// CHECK-NEXT: [[TMP3:%.*]] = extractvalue { <vscale x 8 x half>, <vscale x 8 x half> } [[TMP2]], 0
+// CHECK-NEXT: [[TMP4:%.*]] = tail call <vscale x 16 x half> @llvm.vector.insert.nxv16f16.nxv8f16(<vscale x 16 x half> poison, <vscale x 8 x half> [[TMP3]], i64 0)
+// CHECK-NEXT: [[TMP5:%.*]] = extractvalue { <vscale x 8 x half>, <vscale x 8 x half> } [[TMP2]], 1
+// CHECK-NEXT: [[TMP6:%.*]] = tail call <vscale x 16 x half> @llvm.vector.insert.nxv16f16.nxv8f16(<vscale x 16 x half> [[TMP4]], <vscale x 8 x half> [[TMP5]], i64 8)
+// CHECK-NEXT: ret <vscale x 16 x half> [[TMP6]]
+//
+// CPP-CHECK-LABEL: @_Z20test_svld2q_vnum_f16u10__SVBool_tPKDhl(
+// CPP-CHECK-NEXT: entry:
+// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 8 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv8i1(<vscale x 16 x i1> [[PG:%.*]])
+// CPP-CHECK-NEXT: [[TMP1:%.*]] = getelementptr <vscale x 8 x half>, ptr [[BASE:%.*]], i64 [[VNUM:%.*]]
+// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call { <vscale x 8 x half>, <vscale x 8 x half> } @llvm.aarch64.sve.ld2q.sret.nxv8f16(<vscale x 8 x i1> [[TMP0]], ptr [[TMP1]])
+// CPP-CHECK-NEXT: [[TMP3:%.*]] = extractvalue { <vscale x 8 x half>, <vscale x 8 x half> } [[TMP2]], 0
+// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call <vscale x 16 x half> @llvm.vector.insert.nxv16f16.nxv8f16(<vscale x 16 x half> poison, <vscale x 8 x half> [[TMP3]], i64 0)
+// CPP-CHECK-NEXT: [[TMP5:%.*]] = extractvalue { <vscale x 8 x half>, <vscale x 8 x half> } [[TMP2]], 1
+// CPP-CHECK-NEXT: [[TMP6:%.*]] = tail call <vscale x 16 x half> @llvm.vector.insert.nxv16f16.nxv8f16(<vscale x 16 x half> [[TMP4]], <vscale x 8 x half> [[TMP5]], i64 8)
+// CPP-CHECK-NEXT: ret <vscale x 16 x half> [[TMP6]]
+//
+svfloat16x2_t test_svld2q_vnum_f16(svbool_t pg, const float16_t *base, int64_t vnum)
+{
+ return SVE_ACLE_FUNC(svld2q_vnum,,_f16,)(pg, base, vnum);
+}
+
+// CHECK-LABEL: @test_svld2q_vnum_bf16(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 8 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv8i1(<vscale x 16 x i1> [[PG:%.*]])
+// CHECK-NEXT: [[TMP1:%.*]] = getelementptr <vscale x 8 x bfloat>, ptr [[BASE:%.*]], i64 [[VNUM:%.*]]
+// CHECK-NEXT: [[TMP2:%.*]] = tail call { <vscale x 8 x bfloat>, <vscale x 8 x bfloat> } @llvm.aarch64.sve.ld2q.sret.nxv8bf16(<vscale x 8 x i1> [[TMP0]], ptr [[TMP1]])
+// CHECK-NEXT: [[TMP3:%.*]] = extractvalue { <vscale x 8 x bfloat>, <vscale x 8 x bfloat> } [[TMP2]], 0
+// CHECK-NEXT: [[TMP4:%.*]] = tail call <vscale x 16 x bfloat> @llvm.vector.insert.nxv16bf16.nxv8bf16(<vscale x 16 x bfloat> poison, <vscale x 8 x bfloat> [[TMP3]], i64 0)
+// CHECK-NEXT: [[TMP5:%.*]] = extractvalue { <vscale x 8 x bfloat>, <vscale x 8 x bfloat> } [[TMP2]], 1
+// CHECK-NEXT: [[TMP6:%.*]] = tail call <vscale x 16 x bfloat> @llvm.vector.insert.nxv16bf16.nxv8bf16(<vscale x 16 x bfloat> [[TMP4]], <vscale x 8 x bfloat> [[TMP5]], i64 8)
+// CHECK-NEXT: ret <vscale x 16 x bfloat> [[TMP6]]
+//
+// CPP-CHECK-LABEL: @_Z21test_svld2q_vnum_bf16u10__SVBool_tPKu6__bf16l(
+// CPP-CHECK-NEXT: entry:
+// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 8 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv8i1(<vscale x 16 x i1> [[PG:%.*]])
+// CPP-CHECK-NEXT: [[TMP1:%.*]] = getelementptr <vscale x 8 x bfloat>, ptr [[BASE:%.*]], i64 [[VNUM:%.*]]
+// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call { <vscale x 8 x bfloat>, <vscale x 8 x bfloat> } @llvm.aarch64.sve.ld2q.sret.nxv8bf16(<vscale x 8 x i1> [[TMP0]], ptr [[TMP1]])
+// CPP-CHECK-NEXT: [[TMP3:%.*]] = extractvalue { <vscale x 8 x bfloat>, <vscale x 8 x bfloat> } [[TMP2]], 0
+// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call <vscale x 16 x bfloat> @llvm.vector.insert.nxv16bf16.nxv8bf16(<vscale x 16 x bfloat> poison, <vscale x 8 x bfloat> [[TMP3]], i64 0)
+// CPP-CHECK-NEXT: [[TMP5:%.*]] = extractvalue { <vscale x 8 x bfloat>, <vscale x 8 x bfloat> } [[TMP2]], 1
+// CPP-CHECK-NEXT: [[TMP6:%.*]] = tail call <vscale x 16 x bfloat> @llvm.vector.insert.nxv16bf16.nxv8bf16(<vscale x 16 x bfloat> [[TMP4]], <vscale x 8 x bfloat> [[TMP5]], i64 8)
+// CPP-CHECK-NEXT: ret <vscale x 16 x bfloat> [[TMP6]]
+//
+svbfloat16x2_t test_svld2q_vnum_bf16(svbool_t pg, const bfloat16_t *base, int64_t vnum)
+{
+ return SVE_ACLE_FUNC(svld2q_vnum,,_bf16,)(pg, base, vnum);
+}
+
+// CHECK-LABEL: @test_svld2q_vnum_f32(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 4 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv4i1(<vscale x 16 x i1> [[PG:%.*]])
+// CHECK-NEXT: [[TMP1:%.*]] = getelementptr <vscale x 4 x float>, ptr [[BASE:%.*]], i64 [[VNUM:%.*]]
+// CHECK-NEXT: [[TMP2:%.*]] = tail call { <vscale x 4 x float>, <vscale x 4 x float> } @llvm.aarch64.sve.ld2q.sret.nxv4f32(<vscale x 4 x i1> [[TMP0]], ptr [[TMP1]])
+// CHECK-NEXT: [[TMP3:%.*]] = extractvalue { <vscale x 4 x float>, <vscale x 4 x float> } [[TMP2]], 0
+// CHECK-NEXT: [[TMP4:%.*]] = tail call <vscale x 8 x float> @llvm.vector.insert.nxv8f32.nxv4f32(<vscale x 8 x float> poison, <vscale x 4 x float> [[TMP3]], i64 0)
+// CHECK-NEXT: [[TMP5:%.*]] = extractvalue { <vscale x 4 x float>, <vscale x 4 x float> } [[TMP2]], 1
+// CHECK-NEXT: [[TMP6:%.*]] = tail call <vscale x 8 x float> @llvm.vector.insert.nxv8f32.nxv4f32(<vscale x 8 x float> [[TMP4]], <vscale x 4 x float> [[TMP5]], i64 4)
+// CHECK-NEXT: ret <vscale x 8 x float> [[TMP6]]
+//
+// CPP-CHECK-LABEL: @_Z20test_svld2q_vnum_f32u10__SVBool_tPKfl(
+// CPP-CHECK-NEXT: entry:
+// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 4 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv4i1(<vscale x 16 x i1> [[PG:%.*]])
+// CPP-CHECK-NEXT: [[TMP1:%.*]] = getelementptr <vscale x 4 x float>, ptr [[BASE:%.*]], i64 [[VNUM:%.*]]
+// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call { <vscale x 4 x float>, <vscale x 4 x float> } @llvm.aarch64.sve.ld2q.sret.nxv4f32(<vscale x 4 x i1> [[TMP0]], ptr [[TMP1]])
+// CPP-CHECK-NEXT: [[TMP3:%.*]] = extractvalue { <vscale x 4 x float>, <vscale x 4 x float> } [[TMP2]], 0
+// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call <vscale x 8 x float> @llvm.vector.insert.nxv8f32.nxv4f32(<vscale x 8 x float> poison, <vscale x 4 x float> [[TMP3]], i64 0)
+// CPP-CHECK-NEXT: [[TMP5:%.*]] = extractvalue { <vscale x 4 x float>, <vscale x 4 x float> } [[TMP2]], 1
+// CPP-CHECK-NEXT: [[TMP6:%.*]] = tail call <vscale x 8 x float> @llvm.vector.insert.nxv8f32.nxv4f32(<vscale x 8 x float> [[TMP4]], <vscale x 4 x float> [[TMP5]], i64 4)
+// CPP-CHECK-NEXT: ret <vscale x 8 x float> [[TMP6]]
+//
+svfloat32x2_t test_svld2q_vnum_f32(svbool_t pg, const float32_t *base, int64_t vnum)
+{
+ return SVE_ACLE_FUNC(svld2q_vnum,,_f32,)(pg, base, vnum);
+}
+
+// CHECK-LABEL: @test_svld2q_vnum_f64(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 2 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv2i1(<vscale x 16 x i1> [[PG:%.*]])
+// CHECK-NEXT: [[TMP1:%.*]] = getelementptr <vscale x 2 x double>, ptr [[BASE:%.*]], i64 [[VNUM:%.*]]
+// CHECK-NEXT: [[TMP2:%.*]] = tail call { <vscale x 2 x double>, <vscale x 2 x double> } @llvm.aarch64.sve.ld2q.sret.nxv2f64(<vscale x 2 x i1> [[TMP0]], ptr [[TMP1]])
+// CHECK-NEXT: [[TMP3:%.*]] = extractvalue { <vscale x 2 x double>, <vscale x 2 x double> } [[TMP2]], 0
+// CHECK-NEXT: [[TMP4:%.*]] = tail call <vscale x 4 x double> @llvm.vector.insert.nxv4f64.nxv2f64(<vscale x 4 x double> poison, <vscale x 2 x double> [[TMP3]], i64 0)
+// CHECK-NEXT: [[TMP5:%.*]] = extractvalue { <vscale x 2 x double>, <vscale x 2 x double> } [[TMP2]], 1
+// CHECK-NEXT: [[TMP6:%.*]] = tail call <vscale x 4 x double> @llvm.vector.insert.nxv4f64.nxv2f64(<vscale x 4 x double> [[TMP4]], <vscale x 2 x double> [[TMP5]], i64 2)
+// CHECK-NEXT: ret <vscale x 4 x double> [[TMP6]]
+//
+// CPP-CHECK-LABEL: @_Z20test_svld2q_vnum_f64u10__SVBool_tPKdl(
+// CPP-CHECK-NEXT: entry:
+// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 2 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv2i1(<vscale x 16 x i1> [[PG:%.*]])
+// CPP-CHECK-NEXT: [[TMP1:%.*]] = getelementptr <vscale x 2 x double>, ptr [[BASE:%.*]], i64 [[VNUM:%.*]]
+// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call { <vscale x 2 x double>, <vscale x 2 x double> } @llvm.aarch64.sve.ld2q.sret.nxv2f64(<vscale x 2 x i1> [[TMP0]], ptr [[TMP1]])
+// CPP-CHECK-NEXT: [[TMP3:%.*]] = extractvalue { <vscale x 2 x double>, <vscale x 2 x double> } [[TMP2]], 0
+// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call <vscale x 4 x double> @llvm.vector.insert.nxv4f64.nxv2f64(<vscale x 4 x double> poison, <vscale x 2 x double> [[TMP3]], i64 0)
+// CPP-CHECK-NEXT: [[TMP5:%.*]] = extractvalue { <vscale x 2 x double>, <vscale x 2 x double> } [[TMP2]], 1
+// CPP-CHECK-NEXT: [[TMP6:%.*]] = tail call <vscale x 4 x double> @llvm.vector.insert.nxv4f64.nxv2f64(<vscale x 4 x double> [[TMP4]], <vscale x 2 x double> [[TMP5]], i64 2)
+// CPP-CHECK-NEXT: ret <vscale x 4 x double> [[TMP6]]
+//
+svfloat64x2_t test_svld2q_vnum_f64(svbool_t pg, const float64_t *base, int64_t vnum)
+{
+ return SVE_ACLE_FUNC(svld2q_vnum,,_f64,)(pg, base, vnum);
+}
+
+// CHECK-LABEL: @test_svld3q_u8(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[TMP0:%.*]] = tail call { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } @llvm.aarch64.sve.ld3q.sret.nxv16i8(<vscale x 16 x i1> [[PG:%.*]], ptr [[BASE:%.*]])
+// CHECK-NEXT: [[TMP1:%.*]] = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } [[TMP0]], 0
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <vscale x 48 x i8> @llvm.vector.insert.nxv48i8.nxv16i8(<vscale x 48 x i8> poison, <vscale x 16 x i8> [[TMP1]], i64 0)
+// CHECK-NEXT: [[TMP3:%.*]] = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } [[TMP0]], 1
+// CHECK-NEXT: [[TMP4:%.*]] = tail call <vscale x 48 x i8> @llvm.vector.insert.nxv48i8.nxv16i8(<vscale x 48 x i8> [[TMP2]], <vscale x 16 x i8> [[TMP3]], i64 16)
+// CHECK-NEXT: [[TMP5:%.*]] = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } [[TMP0]], 2
+// CHECK-NEXT: [[TMP6:%.*]] = tail call <vscale x 48 x i8> @llvm.vector.insert.nxv48i8.nxv16i8(<vscale x 48 x i8> [[TMP4]], <vscale x 16 x i8> [[TMP5]], i64 32)
+// CHECK-NEXT: ret <vscale x 48 x i8> [[TMP6]]
+//
+// CPP-CHECK-LABEL: @_Z14test_svld3q_u8u10__SVBool_tPKh(
+// CPP-CHECK-NEXT: entry:
+// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } @llvm.aarch64.sve.ld3q.sret.nxv16i8(<vscale x 16 x i1> [[PG:%.*]], ptr [[BASE:%.*]])
+// CPP-CHECK-NEXT: [[TMP1:%.*]] = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } [[TMP0]], 0
+// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call <vscale x 48 x i8> @llvm.vector.insert.nxv48i8.nxv16i8(<vscale x 48 x i8> poison, <vscale x 16 x i8> [[TMP1]], i64 0)
+// CPP-CHECK-NEXT: [[TMP3:%.*]] = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } [[TMP0]], 1
+// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call <vscale x 48 x i8> @llvm.vector.insert.nxv48i8.nxv16i8(<vscale x 48 x i8> [[TMP2]], <vscale x 16 x i8> [[TMP3]], i64 16)
+// CPP-CHECK-NEXT: [[TMP5:%.*]] = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } [[TMP0]], 2
+// CPP-CHECK-NEXT: [[TMP6:%.*]] = tail call <vscale x 48 x i8> @llvm.vector.insert.nxv48i8.nxv16i8(<vscale x 48 x i8> [[TMP4]], <vscale x 16 x i8> [[TMP5]], i64 32)
+// CPP-CHECK-NEXT: ret <vscale x 48 x i8> [[TMP6]]
+//
+svuint8x3_t test_svld3q_u8(svbool_t pg, const uint8_t *base)
+{
+ return SVE_ACLE_FUNC(svld3q,,_u8,)(pg, base);
+}
+
+// CHECK-LABEL: @test_svld3q_u16(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 8 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv8i1(<vscale x 16 x i1> [[PG:%.*]])
+// CHECK-NEXT: [[TMP1:%.*]] = tail call { <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16> } @llvm.aarch64.sve.ld3q.sret.nxv8i16(<vscale x 8 x i1> [[TMP0]], ptr [[BASE:%.*]])
+// CHECK-NEXT: [[TMP2:%.*]] = extractvalue { <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16> } [[TMP1]], 0
+// CHECK-NEXT: [[TMP3:%.*]] = tail call <vscale x 24 x i16> @llvm.vector.insert.nxv24i16.nxv8i16(<vscale x 24 x i16> poison, <vscale x 8 x i16> [[TMP2]], i64 0)
+// CHECK-NEXT: [[TMP4:%.*]] = extractvalue { <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16> } [[TMP1]], 1
+// CHECK-NEXT: [[TMP5:%.*]] = tail call <vscale x 24 x i16> @llvm.vector.insert.nxv24i16.nxv8i16(<vscale x 24 x i16> [[TMP3]], <vscale x 8 x i16> [[TMP4]], i64 8)
+// CHECK-NEXT: [[TMP6:%.*]] = extractvalue { <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16> } [[TMP1]], 2
+// CHECK-NEXT: [[TMP7:%.*]] = tail call <vscale x 24 x i16> @llvm.vector.insert.nxv24i16.nxv8i16(<vscale x 24 x i16> [[TMP5]], <vscale x 8 x i16> [[TMP6]], i64 16)
+// CHECK-NEXT: ret <vscale x 24 x i16> [[TMP7]]
+//
+// CPP-CHECK-LABEL: @_Z15test_svld3q_u16u10__SVBool_tPKt(
+// CPP-CHECK-NEXT: entry:
+// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 8 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv8i1(<vscale x 16 x i1> [[PG:%.*]])
+// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call { <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16> } @llvm.aarch64.sve.ld3q.sret.nxv8i16(<vscale x 8 x i1> [[TMP0]], ptr [[BASE:%.*]])
+// CPP-CHECK-NEXT: [[TMP2:%.*]] = extractvalue { <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16> } [[TMP1]], 0
+// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call <vscale x 24 x i16> @llvm.vector.insert.nxv24i16.nxv8i16(<vscale x 24 x i16> poison, <vscale x 8 x i16> [[TMP2]], i64 0)
+// CPP-CHECK-NEXT: [[TMP4:%.*]] = extractvalue { <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16> } [[TMP1]], 1
+// CPP-CHECK-NEXT: [[TMP5:%.*]] = tail call <vscale x 24 x i16> @llvm.vector.insert.nxv24i16.nxv8i16(<vscale x 24 x i16> [[TMP3]], <vscale x 8 x i16> [[TMP4]], i64 8)
+// CPP-CHECK-NEXT: [[TMP6:%.*]] = extractvalue { <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16> } [[TMP1]], 2
+// CPP-CHECK-NEXT: [[TMP7:%.*]] = tail call <vscale x 24 x i16> @llvm.vector.insert.nxv24i16.nxv8i16(<vscale x 24 x i16> [[TMP5]], <vscale x 8 x i16> [[TMP6]], i64 16)
+// CPP-CHECK-NEXT: ret <vscale x 24 x i16> [[TMP7]]
+//
+svuint16x3_t test_svld3q_u16(svbool_t pg, const uint16_t *base)
+{
+ return SVE_ACLE_FUNC(svld3q,,_u16,)(pg, base);
+}
+
+// CHECK-LABEL: @test_svld3q_s16(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 8 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv8i1(<vscale x 16 x i1> [[PG:%.*]])
+// CHECK-NEXT: [[TMP1:%.*]] = tail call { <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16> } @llvm.aarch64.sve.ld3q.sret.nxv8i16(<vscale x 8 x i1> [[TMP0]], ptr [[BASE:%.*]])
+// CHECK-NEXT: [[TMP2:%.*]] = extractvalue { <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16> } [[TMP1]], 0
+// CHECK-NEXT: [[TMP3:%.*]] = tail call <vscale x 24 x i16> @llvm.vector.insert.nxv24i16.nxv8i16(<vscale x 24 x i16> poison, <vscale x 8 x i16> [[TMP2]], i64 0)
+// CHECK-NEXT: [[TMP4:%.*]] = extractvalue { <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16> } [[TMP1]], 1
+// CHECK-NEXT: [[TMP5:%.*]] = tail call <vscale x 24 x i16> @llvm.vector.insert.nxv24i16.nxv8i16(<vscale x 24 x i16> [[TMP3]], <vscale x 8 x i16> [[TMP4]], i64 8)
+// CHECK-NEXT: [[TMP6:%.*]] = extractvalue { <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16> } [[TMP1]], 2
+// CHECK-NEXT: [[TMP7:%.*]] = tail call <vscale x 24 x i16> @llvm.vector.insert.nxv24i16.nxv8i16(<vscale x 24 x i16> [[TMP5]], <vscale x 8 x i16> [[TMP6]], i64 16)
+// CHECK-NEXT: ret <vscale x 24 x i16> [[TMP7]]
+//
+// CPP-CHECK-LABEL: @_Z15test_svld3q_s16u10__SVBool_tPKs(
+// CPP-CHECK-NEXT: entry:
+// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 8 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv8i1(<vscale x 16 x i1> [[PG:%.*]])
+// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call { <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16> } @llvm.aarch64.sve.ld3q.sret.nxv8i16(<vscale x 8 x i1> [[TMP0]], ptr [[BASE:%.*]])
+// CPP-CHECK-NEXT: [[TMP2:%.*]] = extractvalue { <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16> } [[TMP1]], 0
+// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call <vscale x 24 x i16> @llvm.vector.insert.nxv24i16.nxv8i16(<vscale x 24 x i16> poison, <vscale x 8 x i16> [[TMP2]], i64 0)
+// CPP-CHECK-NEXT: [[TMP4:%.*]] = extractvalue { <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16> } [[TMP1]], 1
+// CPP-CHECK-NEXT: [[TMP5:%.*]] = tail call <vscale x 24 x i16> @llvm.vector.insert.nxv24i16.nxv8i16(<vscale x 24 x i16> [[TMP3]], <vscale x 8 x i16> [[TMP4]], i64 8)
+// CPP-CHECK-NEXT: [[TMP6:%.*]] = extractvalue { <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16> } [[TMP1]], 2
+// CPP-CHECK-NEXT: [[TMP7:%.*]] = tail call <vscale x 24 x i16> @llvm.vector.insert.nxv24i16.nxv8i16(<vscale x 24 x i16> [[TMP5]], <vscale x 8 x i16> [[TMP6]], i64 16)
+// CPP-CHECK-NEXT: ret <vscale x 24 x i16> [[TMP7]]
+//
+svint16x3_t test_svld3q_s16(svbool_t pg, const int16_t *base)
+{
+ return SVE_ACLE_FUNC(svld3q,,_s16,)(pg, base);
+}
+
+// CHECK-LABEL: @test_svld3q_u32(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 4 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv4i1(<vscale x 16 x i1> [[PG:%.*]])
+// CHECK-NEXT: [[TMP1:%.*]] = tail call { <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32> } @llvm.aarch64.sve.ld3q.sret.nxv4i32(<vscale x 4 x i1> [[TMP0]], ptr [[BASE:%.*]])
+// CHECK-NEXT: [[TMP2:%.*]] = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32> } [[TMP1]], 0
+// CHECK-NEXT: [[TMP3:%.*]] = tail call <vscale x 12 x i32> @llvm.vector.insert.nxv12i32.nxv4i32(<vscale x 12 x i32> poison, <vscale x 4 x i32> [[TMP2]], i64 0)
+// CHECK-NEXT: [[TMP4:%.*]] = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32> } [[TMP1]], 1
+// CHECK-NEXT: [[TMP5:%.*]] = tail call <vscale x 12 x i32> @llvm.vector.insert.nxv12i32.nxv4i32(<vscale x 12 x i32> [[TMP3]], <vscale x 4 x i32> [[TMP4]], i64 4)
+// CHECK-NEXT: [[TMP6:%.*]] = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32> } [[TMP1]], 2
+// CHECK-NEXT: [[TMP7:%.*]] = tail call <vscale x 12 x i32> @llvm.vector.insert.nxv12i32.nxv4i32(<vscale x 12 x i32> [[TMP5]], <vscale x 4 x i32> [[TMP6]], i64 8)
+// CHECK-NEXT: ret <vscale x 12 x i32> [[TMP7]]
+//
+// CPP-CHECK-LABEL: @_Z15test_svld3q_u32u10__SVBool_tPKj(
+// CPP-CHECK-NEXT: entry:
+// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 4 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv4i1(<vscale x 16 x i1> [[PG:%.*]])
+// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call { <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32> } @llvm.aarch64.sve.ld3q.sret.nxv4i32(<vscale x 4 x i1> [[TMP0]], ptr [[BASE:%.*]])
+// CPP-CHECK-NEXT: [[TMP2:%.*]] = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32> } [[TMP1]], 0
+// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call <vscale x 12 x i32> @llvm.vector.insert.nxv12i32.nxv4i32(<vscale x 12 x i32> poison, <vscale x 4 x i32> [[TMP2]], i64 0)
+// CPP-CHECK-NEXT: [[TMP4:%.*]] = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32> } [[TMP1]], 1
+// CPP-CHECK-NEXT: [[TMP5:%.*]] = tail call <vscale x 12 x i32> @llvm.vector.insert.nxv12i32.nxv4i32(<vscale x 12 x i32> [[TMP3]], <vscale x 4 x i32> [[TMP4]], i64 4)
+// CPP-CHECK-NEXT: [[TMP6:%.*]] = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32> } [[TMP1]], 2
+// CPP-CHECK-NEXT: [[TMP7:%.*]] = tail call <vscale x 12 x i32> @llvm.vector.insert.nxv12i32.nxv4i32(<vscale x 12 x i32> [[TMP5]], <vscale x 4 x i32> [[TMP6]], i64 8)
+// CPP-CHECK-NEXT: ret <vscale x 12 x i32> [[TMP7]]
+//
+svuint32x3_t test_svld3q_u32(svbool_t pg, const uint32_t *base)
+{
+ return SVE_ACLE_FUNC(svld3q,,_u32,)(pg, base);
+}
+
+// CHECK-LABEL: @test_svld3q_s32(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 4 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv4i1(<vscale x 16 x i1> [[PG:%.*]])
+// CHECK-NEXT: [[TMP1:%.*]] = tail call { <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32> } @llvm.aarch64.sve.ld3q.sret.nxv4i32(<vscale x 4 x i1> [[TMP0]], ptr [[BASE:%.*]])
+// CHECK-NEXT: [[TMP2:%.*]] = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32> } [[TMP1]], 0
+// CHECK-NEXT: [[TMP3:%.*]] = tail call <vscale x 12 x i32> @llvm.vector.insert.nxv12i32.nxv4i32(<vscale x 12 x i32> poison, <vscale x 4 x i32> [[TMP2]], i64 0)
+// CHECK-NEXT: [[TMP4:%.*]] = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32> } [[TMP1]], 1
+// CHECK-NEXT: [[TMP5:%.*]] = tail call <vscale x 12 x i32> @llvm.vector.insert.nxv12i32.nxv4i32(<vscale x 12 x i32> [[TMP3]], <vscale x 4 x i32> [[TMP4]], i64 4)
+// CHECK-NEXT: [[TMP6:%.*]] = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32> } [[TMP1]], 2
+// CHECK-NEXT: [[TMP7:%.*]] = tail call <vscale x 12 x i32> @llvm.vector.insert.nxv12i32.nxv4i32(<vscale x 12 x i32> [[TMP5]], <vscale x 4 x i32> [[TMP6]], i64 8)
+// CHECK-NEXT: ret <vscale x 12 x i32> [[TMP7]]
+//
+// CPP-CHECK-LABEL: @_Z15test_svld3q_s32u10__SVBool_tPKi(
+// CPP-CHECK-NEXT: entry:
+// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 4 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv4i1(<vscale x 16 x i1> [[PG:%.*]])
+// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call { <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32> } @llvm.aarch64.sve.ld3q.sret.nxv4i32(<vscale x 4 x i1> [[TMP0]], ptr [[BASE:%.*]])
+// CPP-CHECK-NEXT: [[TMP2:%.*]] = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32> } [[TMP1]], 0
+// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call <vscale x 12 x i32> @llvm.vector.insert.nxv12i32.nxv4i32(<vscale x 12 x i32> poison, <vscale x 4 x i32> [[TMP2]], i64 0)
+// CPP-CHECK-NEXT: [[TMP4:%.*]] = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32> } [[TMP1]], 1
+// CPP-CHECK-NEXT: [[TMP5:%.*]] = tail call <vscale x 12 x i32> @llvm.vector.insert.nxv12i32.nxv4i32(<vscale x 12 x i32> [[TMP3]], <vscale x 4 x i32> [[TMP4]], i64 4)
+// CPP-CHECK-NEXT: [[TMP6:%.*]] = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32> } [[TMP1]], 2
+// CPP-CHECK-NEXT: [[TMP7:%.*]] = tail call <vscale x 12 x i32> @llvm.vector.insert.nxv12i32.nxv4i32(<vscale x 12 x i32> [[TMP5]], <vscale x 4 x i32> [[TMP6]], i64 8)
+// CPP-CHECK-NEXT: ret <vscale x 12 x i32> [[TMP7]]
+//
+svint32x3_t test_svld3q_s32(svbool_t pg, const int32_t *base)
+{
+ return SVE_ACLE_FUNC(svld3q,,_s32,)(pg, base);
+}
+
+// CHECK-LABEL: @test_svld3q_u64(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 2 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv2i1(<vscale x 16 x i1> [[PG:%.*]])
+// CHECK-NEXT: [[TMP1:%.*]] = tail call { <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64> } @llvm.aarch64.sve.ld3q.sret.nxv2i64(<vscale x 2 x i1> [[TMP0]], ptr [[BASE:%.*]])
+// CHECK-NEXT: [[TMP2:%.*]] = extractvalue { <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64> } [[TMP1]], 0
+// CHECK-NEXT: [[TMP3:%.*]] = tail call <vscale x 6 x i64> @llvm.vector.insert.nxv6i64.nxv2i64(<vscale x 6 x i64> poison, <vscale x 2 x i64> [[TMP2]], i64 0)
+// CHECK-NEXT: [[TMP4:%.*]] = extractvalue { <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64> } [[TMP1]], 1
+// CHECK-NEXT: [[TMP5:%.*]] = tail call <vscale x 6 x i64> @llvm.vector.insert.nxv6i64.nxv2i64(<vscale x 6 x i64> [[TMP3]], <vscale x 2 x i64> [[TMP4]], i64 2)
+// CHECK-NEXT: [[TMP6:%.*]] = extractvalue { <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64> } [[TMP1]], 2
+// CHECK-NEXT: [[TMP7:%.*]] = tail call <vscale x 6 x i64> @llvm.vector.insert.nxv6i64.nxv2i64(<vscale x 6 x i64> [[TMP5]], <vscale x 2 x i64> [[TMP6]], i64 4)
+// CHECK-NEXT: ret <vscale x 6 x i64> [[TMP7]]
+//
+// CPP-CHECK-LABEL: @_Z15test_svld3q_u64u10__SVBool_tPKm(
+// CPP-CHECK-NEXT: entry:
+// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 2 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv2i1(<vscale x 16 x i1> [[PG:%.*]])
+// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call { <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64> } @llvm.aarch64.sve.ld3q.sret.nxv2i64(<vscale x 2 x i1> [[TMP0]], ptr [[BASE:%.*]])
+// CPP-CHECK-NEXT: [[TMP2:%.*]] = extractvalue { <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64> } [[TMP1]], 0
+// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call <vscale x 6 x i64> @llvm.vector.insert.nxv6i64.nxv2i64(<vscale x 6 x i64> poison, <vscale x 2 x i64> [[TMP2]], i64 0)
+// CPP-CHECK-NEXT: [[TMP4:%.*]] = extractvalue { <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64> } [[TMP1]], 1
+// CPP-CHECK-NEXT: [[TMP5:%.*]] = tail call <vscale x 6 x i64> @llvm.vector.insert.nxv6i64.nxv2i64(<vscale x 6 x i64> [[TMP3]], <vscale x 2 x i64> [[TMP4]], i64 2)
+// CPP-CHECK-NEXT: [[TMP6:%.*]] = extractvalue { <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64> } [[TMP1]], 2
+// CPP-CHECK-NEXT: [[TMP7:%.*]] = tail call <vscale x 6 x i64> @llvm.vector.insert.nxv6i64.nxv2i64(<vscale x 6 x i64> [[TMP5]], <vscale x 2 x i64> [[TMP6]], i64 4)
+// CPP-CHECK-NEXT: ret <vscale x 6 x i64> [[TMP7]]
+//
+svuint64x3_t test_svld3q_u64(svbool_t pg, const uint64_t *base)
+{
+ return SVE_ACLE_FUNC(svld3q,,_u64,)(pg, base);
+}
+
+// CHECK-LABEL: @test_svld3q_s64(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 2 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv2i1(<vscale x 16 x i1> [[PG:%.*]])
+// CHECK-NEXT: [[TMP1:%.*]] = tail call { <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64> } @llvm.aarch64.sve.ld3q.sret.nxv2i64(<vscale x 2 x i1> [[TMP0]], ptr [[BASE:%.*]])
+// CHECK-NEXT: [[TMP2:%.*]] = extractvalue { <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64> } [[TMP1]], 0
+// CHECK-NEXT: [[TMP3:%.*]] = tail call <vscale x 6 x i64> @llvm.vector.insert.nxv6i64.nxv2i64(<vscale x 6 x i64> poison, <vscale x 2 x i64> [[TMP2]], i64 0)
+// CHECK-NEXT: [[TMP4:%.*]] = extractvalue { <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64> } [[TMP1]], 1
+// CHECK-NEXT: [[TMP5:%.*]] = tail call <vscale x 6 x i64> @llvm.vector.insert.nxv6i64.nxv2i64(<vscale x 6 x i64> [[TMP3]], <vscale x 2 x i64> [[TMP4]], i64 2)
+// CHECK-NEXT: [[TMP6:%.*]] = extractvalue { <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64> } [[TMP1]], 2
+// CHECK-NEXT: [[TMP7:%.*]] = tail call <vscale x 6 x i64> @llvm.vector.insert.nxv6i64.nxv2i64(<vscale x 6 x i64> [[TMP5]], <vscale x 2 x i64> [[TMP6]], i64 4)
+// CHECK-NEXT: ret <vscale x 6 x i64> [[TMP7]]
+//
+// CPP-CHECK-LABEL: @_Z15test_svld3q_s64u10__SVBool_tPKl(
+// CPP-CHECK-NEXT: entry:
+// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 2 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv2i1(<vscale x 16 x i1> [[PG:%.*]])
+// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call { <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64> } @llvm.aarch64.sve.ld3q.sret.nxv2i64(<vscale x 2 x i1> [[TMP0]], ptr [[BASE:%.*]])
+// CPP-CHECK-NEXT: [[TMP2:%.*]] = extractvalue { <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64> } [[TMP1]], 0
+// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call <vscale x 6 x i64> @llvm.vector.insert.nxv6i64.nxv2i64(<vscale x 6 x i64> poison, <vscale x 2 x i64> [[TMP2]], i64 0)
+// CPP-CHECK-NEXT: [[TMP4:%.*]] = extractvalue { <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64> } [[TMP1]], 1
+// CPP-CHECK-NEXT: [[TMP5:%.*]] = tail call <vscale x 6 x i64> @llvm.vector.insert.nxv6i64.nxv2i64(<vscale x 6 x i64> [[TMP3]], <vscale x 2 x i64> [[TMP4]], i64 2)
+// CPP-CHECK-NEXT: [[TMP6:%.*]] = extractvalue { <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64> } [[TMP1]], 2
+// CPP-CHECK-NEXT: [[TMP7:%.*]] = tail call <vscale x 6 x i64> @llvm.vector.insert.nxv6i64.nxv2i64(<vscale x 6 x i64> [[TMP5]], <vscale x 2 x i64> [[TMP6]], i64 4)
+// CPP-CHECK-NEXT: ret <vscale x 6 x i64> [[TMP7]]
+//
+svint64x3_t test_svld3q_s64(svbool_t pg, const int64_t *base)
+{
+ return SVE_ACLE_FUNC(svld3q,,_s64,)(pg, base);
+}
+
+// CHECK-LABEL: @test_svld3q_f16(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 8 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv8i1(<vscale x 16 x i1> [[PG:%.*]])
+// CHECK-NEXT: [[TMP1:%.*]] = tail call { <vscale x 8 x half>, <vscale x 8 x half>, <vscale x 8 x half> } @llvm.aarch64.sve.ld3q.sret.nxv8f16(<vscale x 8 x i1> [[TMP0]], ptr [[BASE:%.*]])
+// CHECK-NEXT: [[TMP2:%.*]] = extractvalue { <vscale x 8 x half>, <vscale x 8 x half>, <vscale x 8 x half> } [[TMP1]], 0
+// CHECK-NEXT: [[TMP3:%.*]] = tail call <vscale x 24 x half> @llvm.vector.insert.nxv24f16.nxv8f16(<vscale x 24 x half> poison, <vscale x 8 x half> [[TMP2]], i64 0)
+// CHECK-NEXT: [[TMP4:%.*]] = extractvalue { <vscale x 8 x half>, <vscale x 8 x half>, <vscale x 8 x half> } [[TMP1]], 1
+// CHECK-NEXT: [[TMP5:%.*]] = tail call <vscale x 24 x half> @llvm.vector.insert.nxv24f16.nxv8f16(<vscale x 24 x half> [[TMP3]], <vscale x 8 x half> [[TMP4]], i64 8)
+// CHECK-NEXT: [[TMP6:%.*]] = extractvalue { <vscale x 8 x half>, <vscale x 8 x half>, <vscale x 8 x half> } [[TMP1]], 2
+// CHECK-NEXT: [[TMP7:%.*]] = tail call <vscale x 24 x half> @llvm.vector.insert.nxv24f16.nxv8f16(<vscale x 24 x half> [[TMP5]], <vscale x 8 x half> [[TMP6]], i64 16)
+// CHECK-NEXT: ret <vscale x 24 x half> [[TMP7]]
+//
+// CPP-CHECK-LABEL: @_Z15test_svld3q_f16u10__SVBool_tPKDh(
+// CPP-CHECK-NEXT: entry:
+// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 8 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv8i1(<vscale x 16 x i1> [[PG:%.*]])
+// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call { <vscale x 8 x half>, <vscale x 8 x half>, <vscale x 8 x half> } @llvm.aarch64.sve.ld3q.sret.nxv8f16(<vscale x 8 x i1> [[TMP0]], ptr [[BASE:%.*]])
+// CPP-CHECK-NEXT: [[TMP2:%.*]] = extractvalue { <vscale x 8 x half>, <vscale x 8 x half>, <vscale x 8 x half> } [[TMP1]], 0
+// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call <vscale x 24 x half> @llvm.vector.insert.nxv24f16.nxv8f16(<vscale x 24 x half> poison, <vscale x 8 x half> [[TMP2]], i64 0)
+// CPP-CHECK-NEXT: [[TMP4:%.*]] = extractvalue { <vscale x 8 x half>, <vscale x 8 x half>, <vscale x 8 x half> } [[TMP1]], 1
+// CPP-CHECK-NEXT: [[TMP5:%.*]] = tail call <vscale x 24 x half> @llvm.vector.insert.nxv24f16.nxv8f16(<vscale x 24 x half> [[TMP3]], <vscale x 8 x half> [[TMP4]], i64 8)
+// CPP-CHECK-NEXT: [[TMP6:%.*]] = extractvalue { <vscale x 8 x half>, <vscale x 8 x half>, <vscale x 8 x half> } [[TMP1]], 2
+// CPP-CHECK-NEXT: [[TMP7:%.*]] = tail call <vscale x 24 x half> @llvm.vector.insert.nxv24f16.nxv8f16(<vscale x 24 x half> [[TMP5]], <vscale x 8 x half> [[TMP6]], i64 16)
+// CPP-CHECK-NEXT: ret <vscale x 24 x half> [[TMP7]]
+//
+svfloat16x3_t test_svld3q_f16(svbool_t pg, const float16_t *base)
+{
+ return SVE_ACLE_FUNC(svld3q,,_f16,)(pg, base);
+}
+
+// CHECK-LABEL: @test_svld3q_bf16(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 8 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv8i1(<vscale x 16 x i1> [[PG:%.*]])
+// CHECK-NEXT: [[TMP1:%.*]] = tail call { <vscale x 8 x bfloat>, <vscale x 8 x bfloat>, <vscale x 8 x bfloat> } @llvm.aarch64.sve.ld3q.sret.nxv8bf16(<vscale x 8 x i1> [[TMP0]], ptr [[BASE:%.*]])
+// CHECK-NEXT: [[TMP2:%.*]] = extractvalue { <vscale x 8 x bfloat>, <vscale x 8 x bfloat>, <vscale x 8 x bfloat> } [[TMP1]], 0
+// CHECK-NEXT: [[TMP3:%.*]] = tail call <vscale x 24 x bfloat> @llvm.vector.insert.nxv24bf16.nxv8bf16(<vscale x 24 x bfloat> poison, <vscale x 8 x bfloat> [[TMP2]], i64 0)
+// CHECK-NEXT: [[TMP4:%.*]] = extractvalue { <vscale x 8 x bfloat>, <vscale x 8 x bfloat>, <vscale x 8 x bfloat> } [[TMP1]], 1
+// CHECK-NEXT: [[TMP5:%.*]] = tail call <vscale x 24 x bfloat> @llvm.vector.insert.nxv24bf16.nxv8bf16(<vscale x 24 x bfloat> [[TMP3]], <vscale x 8 x bfloat> [[TMP4]], i64 8)
+// CHECK-NEXT: [[TMP6:%.*]] = extractvalue { <vscale x 8 x bfloat>, <vscale x 8 x bfloat>, <vscale x 8 x bfloat> } [[TMP1]], 2
+// CHECK-NEXT: [[TMP7:%.*]] = tail call <vscale x 24 x bfloat> @llvm.vector.insert.nxv24bf16.nxv8bf16(<vscale x 24 x bfloat> [[TMP5]], <vscale x 8 x bfloat> [[TMP6]], i64 16)
+// CHECK-NEXT: ret <vscale x 24 x bfloat> [[TMP7]]
+//
+// CPP-CHECK-LABEL: @_Z16test_svld3q_bf16u10__SVBool_tPKu6__bf16(
+// CPP-CHECK-NEXT: entry:
+// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 8 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv8i1(<vscale x 16 x i1> [[PG:%.*]])
+// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call { <vscale x 8 x bfloat>, <vscale x 8 x bfloat>, <vscale x 8 x bfloat> } @llvm.aarch64.sve.ld3q.sret.nxv8bf16(<vscale x 8 x i1> [[TMP0]], ptr [[BASE:%.*]])
+// CPP-CHECK-NEXT: [[TMP2:%.*]] = extractvalue { <vscale x 8 x bfloat>, <vscale x 8 x bfloat>, <vscale x 8 x bfloat> } [[TMP1]], 0
+// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call <vscale x 24 x bfloat> @llvm.vector.insert.nxv24bf16.nxv8bf16(<vscale x 24 x bfloat> poison, <vscale x 8 x bfloat> [[TMP2]], i64 0)
+// CPP-CHECK-NEXT: [[TMP4:%.*]] = extractvalue { <vscale x 8 x bfloat>, <vscale x 8 x bfloat>, <vscale x 8 x bfloat> } [[TMP1]], 1
+// CPP-CHECK-NEXT: [[TMP5:%.*]] = tail call <vscale x 24 x bfloat> @llvm.vector.insert.nxv24bf16.nxv8bf16(<vscale x 24 x bfloat> [[TMP3]], <vscale x 8 x bfloat> [[TMP4]], i64 8)
+// CPP-CHECK-NEXT: [[TMP6:%.*]] = extractvalue { <vscale x 8 x bfloat>, <vscale x 8 x bfloat>, <vscale x 8 x bfloat> } [[TMP1]], 2
+// CPP-CHECK-NEXT: [[TMP7:%.*]] = tail call <vscale x 24 x bfloat> @llvm.vector.insert.nxv24bf16.nxv8bf16(<vscale x 24 x bfloat> [[TMP5]], <vscale x 8 x bfloat> [[TMP6]], i64 16)
+// CPP-CHECK-NEXT: ret <vscale x 24 x bfloat> [[TMP7]]
+//
+svbfloat16x3_t test_svld3q_bf16(svbool_t pg, const bfloat16_t *base)
+{
+ return SVE_ACLE_FUNC(svld3q,,_bf16,)(pg, base);
+}
+
+// CHECK-LABEL: @test_svld3q_f32(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 4 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv4i1(<vscale x 16 x i1> [[PG:%.*]])
+// CHECK-NEXT: [[TMP1:%.*]] = tail call { <vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float> } @llvm.aarch64.sve.ld3q.sret.nxv4f32(<vscale x 4 x i1> [[TMP0]], ptr [[BASE:%.*]])
+// CHECK-NEXT: [[TMP2:%.*]] = extractvalue { <vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float> } [[TMP1]], 0
+// CHECK-NEXT: [[TMP3:%.*]] = tail call <vscale x 12 x float> @llvm.vector.insert.nxv12f32.nxv4f32(<vscale x 12 x float> poison, <vscale x 4 x float> [[TMP2]], i64 0)
+// CHECK-NEXT: [[TMP4:%.*]] = extractvalue { <vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float> } [[TMP1]], 1
+// CHECK-NEXT: [[TMP5:%.*]] = tail call <vscale x 12 x float> @llvm.vector.insert.nxv12f32.nxv4f32(<vscale x 12 x float> [[TMP3]], <vscale x 4 x float> [[TMP4]], i64 4)
+// CHECK-NEXT: [[TMP6:%.*]] = extractvalue { <vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float> } [[TMP1]], 2
+// CHECK-NEXT: [[TMP7:%.*]] = tail call <vscale x 12 x float> @llvm.vector.insert.nxv12f32.nxv4f32(<vscale x 12 x float> [[TMP5]], <vscale x 4 x float> [[TMP6]], i64 8)
+// CHECK-NEXT: ret <vscale x 12 x float> [[TMP7]]
+//
+// CPP-CHECK-LABEL: @_Z15test_svld3q_f32u10__SVBool_tPKf(
+// CPP-CHECK-NEXT: entry:
+// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 4 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv4i1(<vscale x 16 x i1> [[PG:%.*]])
+// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call { <vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float> } @llvm.aarch64.sve.ld3q.sret.nxv4f32(<vscale x 4 x i1> [[TMP0]], ptr [[BASE:%.*]])
+// CPP-CHECK-NEXT: [[TMP2:%.*]] = extractvalue { <vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float> } [[TMP1]], 0
+// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call <vscale x 12 x float> @llvm.vector.insert.nxv12f32.nxv4f32(<vscale x 12 x float> poison, <vscale x 4 x float> [[TMP2]], i64 0)
+// CPP-CHECK-NEXT: [[TMP4:%.*]] = extractvalue { <vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float> } [[TMP1]], 1
+// CPP-CHECK-NEXT: [[TMP5:%.*]] = tail call <vscale x 12 x float> @llvm.vector.insert.nxv12f32.nxv4f32(<vscale x 12 x float> [[TMP3]], <vscale x 4 x float> [[TMP4]], i64 4)
+// CPP-CHECK-NEXT: [[TMP6:%.*]] = extractvalue { <vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float> } [[TMP1]], 2
+// CPP-CHECK-NEXT: [[TMP7:%.*]] = tail call <vscale x 12 x float> @llvm.vector.insert.nxv12f32.nxv4f32(<vscale x 12 x float> [[TMP5]], <vscale x 4 x float> [[TMP6]], i64 8)
+// CPP-CHECK-NEXT: ret <vscale x 12 x float> [[TMP7]]
+//
+svfloat32x3_t test_svld3q_f32(svbool_t pg, const float32_t *base)
+{
+ return SVE_ACLE_FUNC(svld3q,,_f32,)(pg, base);
+}
+
+// CHECK-LABEL: @test_svld3q_f64(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 2 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv2i1(<vscale x 16 x i1> [[PG:%.*]])
+// CHECK-NEXT: [[TMP1:%.*]] = tail call { <vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x double> } @llvm.aarch64.sve.ld3q.sret.nxv2f64(<vscale x 2 x i1> [[TMP0]], ptr [[BASE:%.*]])
+// CHECK-NEXT: [[TMP2:%.*]] = extractvalue { <vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x double> } [[TMP1]], 0
+// CHECK-NEXT: [[TMP3:%.*]] = tail call <vscale x 6 x double> @llvm.vector.insert.nxv6f64.nxv2f64(<vscale x 6 x double> poison, <vscale x 2 x double> [[TMP2]], i64 0)
+// CHECK-NEXT: [[TMP4:%.*]] = extractvalue { <vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x double> } [[TMP1]], 1
+// CHECK-NEXT: [[TMP5:%.*]] = tail call <vscale x 6 x double> @llvm.vector.insert.nxv6f64.nxv2f64(<vscale x 6 x double> [[TMP3]], <vscale x 2 x double> [[TMP4]], i64 2)
+// CHECK-NEXT: [[TMP6:%.*]] = extractvalue { <vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x double> } [[TMP1]], 2
+// CHECK-NEXT: [[TMP7:%.*]] = tail call <vscale x 6 x double> @llvm.vector.insert.nxv6f64.nxv2f64(<vscale x 6 x double> [[TMP5]], <vscale x 2 x double> [[TMP6]], i64 4)
+// CHECK-NEXT: ret <vscale x 6 x double> [[TMP7]]
+//
+// CPP-CHECK-LABEL: @_Z15test_svld3q_f64u10__SVBool_tPKd(
+// CPP-CHECK-NEXT: entry:
+// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 2 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv2i1(<vscale x 16 x i1> [[PG:%.*]])
+// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call { <vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x double> } @llvm.aarch64.sve.ld3q.sret.nxv2f64(<vscale x 2 x i1> [[TMP0]], ptr [[BASE:%.*]])
+// CPP-CHECK-NEXT: [[TMP2:%.*]] = extractvalue { <vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x double> } [[TMP1]], 0
+// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call <vscale x 6 x double> @llvm.vector.insert.nxv6f64.nxv2f64(<vscale x 6 x double> poison, <vscale x 2 x double> [[TMP2]], i64 0)
+// CPP-CHECK-NEXT: [[TMP4:%.*]] = extractvalue { <vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x double> } [[TMP1]], 1
+// CPP-CHECK-NEXT: [[TMP5:%.*]] = tail call <vscale x 6 x double> @llvm.vector.insert.nxv6f64.nxv2f64(<vscale x 6 x double> [[TMP3]], <vscale x 2 x double> [[TMP4]], i64 2)
+// CPP-CHECK-NEXT: [[TMP6:%.*]] = extractvalue { <vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x double> } [[TMP1]], 2
+// CPP-CHECK-NEXT: [[TMP7:%.*]] = tail call <vscale x 6 x double> @llvm.vector.insert.nxv6f64.nxv2f64(<vscale x 6 x double> [[TMP5]], <vscale x 2 x double> [[TMP6]], i64 4)
+// CPP-CHECK-NEXT: ret <vscale x 6 x double> [[TMP7]]
+//
+svfloat64x3_t test_svld3q_f64(svbool_t pg, const float64_t *base)
+{
+ return SVE_ACLE_FUNC(svld3q,,_f64,)(pg, base);
+}
+
+// CHECK-LABEL: @test_svld3q_vnum_u8(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[TMP0:%.*]] = getelementptr <vscale x 16 x i8>, ptr [[BASE:%.*]], i64 [[VNUM:%.*]]
+// CHECK-NEXT: [[TMP1:%.*]] = tail call { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } @llvm.aarch64.sve.ld3q.sret.nxv16i8(<vscale x 16 x i1> [[PG:%.*]], ptr [[TMP0]])
+// CHECK-NEXT: [[TMP2:%.*]] = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } [[TMP1]], 0
+// CHECK-NEXT: [[TMP3:%.*]] = tail call <vscale x 48 x i8> @llvm.vector.insert.nxv48i8.nxv16i8(<vscale x 48 x i8> poison, <vscale x 16 x i8> [[TMP2]], i64 0)
+// CHECK-NEXT: [[TMP4:%.*]] = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } [[TMP1]], 1
+// CHECK-NEXT: [[TMP5:%.*]] = tail call <vscale x 48 x i8> @llvm.vector.insert.nxv48i8.nxv16i8(<vscale x 48 x i8> [[TMP3]], <vscale x 16 x i8> [[TMP4]], i64 16)
+// CHECK-NEXT: [[TMP6:%.*]] = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } [[TMP1]], 2
+// CHECK-NEXT: [[TMP7:%.*]] = tail call <vscale x 48 x i8> @llvm.vector.insert.nxv48i8.nxv16i8(<vscale x 48 x i8> [[TMP5]], <vscale x 16 x i8> [[TMP6]], i64 32)
+// CHECK-NEXT: ret <vscale x 48 x i8> [[TMP7]]
+//
+// CPP-CHECK-LABEL: @_Z19test_svld3q_vnum_u8u10__SVBool_tPKhl(
+// CPP-CHECK-NEXT: entry:
+// CPP-CHECK-NEXT: [[TMP0:%.*]] = getelementptr <vscale x 16 x i8>, ptr [[BASE:%.*]], i64 [[VNUM:%.*]]
+// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } @llvm.aarch64.sve.ld3q.sret.nxv16i8(<vscale x 16 x i1> [[PG:%.*]], ptr [[TMP0]])
+// CPP-CHECK-NEXT: [[TMP2:%.*]] = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } [[TMP1]], 0
+// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call <vscale x 48 x i8> @llvm.vector.insert.nxv48i8.nxv16i8(<vscale x 48 x i8> poison, <vscale x 16 x i8> [[TMP2]], i64 0)
+// CPP-CHECK-NEXT: [[TMP4:%.*]] = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } [[TMP1]], 1
+// CPP-CHECK-NEXT: [[TMP5:%.*]] = tail call <vscale x 48 x i8> @llvm.vector.insert.nxv48i8.nxv16i8(<vscale x 48 x i8> [[TMP3]], <vscale x 16 x i8> [[TMP4]], i64 16)
+// CPP-CHECK-NEXT: [[TMP6:%.*]] = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } [[TMP1]], 2
+// CPP-CHECK-NEXT: [[TMP7:%.*]] = tail call <vscale x 48 x i8> @llvm.vector.insert.nxv48i8.nxv16i8(<vscale x 48 x i8> [[TMP5]], <vscale x 16 x i8> [[TMP6]], i64 32)
+// CPP-CHECK-NEXT: ret <vscale x 48 x i8> [[TMP7]]
+//
+svuint8x3_t test_svld3q_vnum_u8(svbool_t pg, const uint8_t *base, int64_t vnum)
+{
+ return SVE_ACLE_FUNC(svld3q_vnum_,,u8,)(pg, base, vnum);
+}
+
+// CHECK-LABEL: @test_svld3q_vnum_s8(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[TMP0:%.*]] = getelementptr <vscale x 16 x i8>, ptr [[BASE:%.*]], i64 [[VNUM:%.*]]
+// CHECK-NEXT: [[TMP1:%.*]] = tail call { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } @llvm.aarch64.sve.ld3q.sret.nxv16i8(<vscale x 16 x i1> [[PG:%.*]], ptr [[TMP0]])
+// CHECK-NEXT: [[TMP2:%.*]] = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } [[TMP1]], 0
+// CHECK-NEXT: [[TMP3:%.*]] = tail call <vscale x 48 x i8> @llvm.vector.insert.nxv48i8.nxv16i8(<vscale x 48 x i8> poison, <vscale x 16 x i8> [[TMP2]], i64 0)
+// CHECK-NEXT: [[TMP4:%.*]] = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } [[TMP1]], 1
+// CHECK-NEXT: [[TMP5:%.*]] = tail call <vscale x 48 x i8> @llvm.vector.insert.nxv48i8.nxv16i8(<vscale x 48 x i8> [[TMP3]], <vscale x 16 x i8> [[TMP4]], i64 16)
+// CHECK-NEXT: [[TMP6:%.*]] = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } [[TMP1]], 2
+// CHECK-NEXT: [[TMP7:%.*]] = tail call <vscale x 48 x i8> @llvm.vector.insert.nxv48i8.nxv16i8(<vscale x 48 x i8> [[TMP5]], <vscale x 16 x i8> [[TMP6]], i64 32)
+// CHECK-NEXT: ret <vscale x 48 x i8> [[TMP7]]
+//
+// CPP-CHECK-LABEL: @_Z19test_svld3q_vnum_s8u10__SVBool_tPKal(
+// CPP-CHECK-NEXT: entry:
+// CPP-CHECK-NEXT: [[TMP0:%.*]] = getelementptr <vscale x 16 x i8>, ptr [[BASE:%.*]], i64 [[VNUM:%.*]]
+// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } @llvm.aarch64.sve.ld3q.sret.nxv16i8(<vscale x 16 x i1> [[PG:%.*]], ptr [[TMP0]])
+// CPP-CHECK-NEXT: [[TMP2:%.*]] = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } [[TMP1]], 0
+// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call <vscale x 48 x i8> @llvm.vector.insert.nxv48i8.nxv16i8(<vscale x 48 x i8> poison, <vscale x 16 x i8> [[TMP2]], i64 0)
+// CPP-CHECK-NEXT: [[TMP4:%.*]] = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } [[TMP1]], 1
+// CPP-CHECK-NEXT: [[TMP5:%.*]] = tail call <vscale x 48 x i8> @llvm.vector.insert.nxv48i8.nxv16i8(<vscale x 48 x i8> [[TMP3]], <vscale x 16 x i8> [[TMP4]], i64 16)
+// CPP-CHECK-NEXT: [[TMP6:%.*]] = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } [[TMP1]], 2
+// CPP-CHECK-NEXT: [[TMP7:%.*]] = tail call <vscale x 48 x i8> @llvm.vector.insert.nxv48i8.nxv16i8(<vscale x 48 x i8> [[TMP5]], <vscale x 16 x i8> [[TMP6]], i64 32)
+// CPP-CHECK-NEXT: ret <vscale x 48 x i8> [[TMP7]]
+//
+svint8x3_t test_svld3q_vnum_s8(svbool_t pg, const int8_t *base, int64_t vnum)
+{
+ return SVE_ACLE_FUNC(svld3q_vnum,,_s8,)(pg, base, vnum);
+}
+
+// CHECK-LABEL: @test_svld3q_vnum_u16(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 8 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv8i1(<vscale x 16 x i1> [[PG:%.*]])
+// CHECK-NEXT: [[TMP1:%.*]] = getelementptr <vscale x 8 x i16>, ptr [[BASE:%.*]], i64 [[VNUM:%.*]]
+// CHECK-NEXT: [[TMP2:%.*]] = tail call { <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16> } @llvm.aarch64.sve.ld3q.sret.nxv8i16(<vscale x 8 x i1> [[TMP0]], ptr [[TMP1]])
+// CHECK-NEXT: [[TMP3:%.*]] = extractvalue { <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16> } [[TMP2]], 0
+// CHECK-NEXT: [[TMP4:%.*]] = tail call <vscale x 24 x i16> @llvm.vector.insert.nxv24i16.nxv8i16(<vscale x 24 x i16> poison, <vscale x 8 x i16> [[TMP3]], i64 0)
+// CHECK-NEXT: [[TMP5:%.*]] = extractvalue { <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16> } [[TMP2]], 1
+// CHECK-NEXT: [[TMP6:%.*]] = tail call <vscale x 24 x i16> @llvm.vector.insert.nxv24i16.nxv8i16(<vscale x 24 x i16> [[TMP4]], <vscale x 8 x i16> [[TMP5]], i64 8)
+// CHECK-NEXT: [[TMP7:%.*]] = extractvalue { <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16> } [[TMP2]], 2
+// CHECK-NEXT: [[TMP8:%.*]] = tail call <vscale x 24 x i16> @llvm.vector.insert.nxv24i16.nxv8i16(<vscale x 24 x i16> [[TMP6]], <vscale x 8 x i16> [[TMP7]], i64 16)
+// CHECK-NEXT: ret <vscale x 24 x i16> [[TMP8]]
+//
+// CPP-CHECK-LABEL: @_Z20test_svld3q_vnum_u16u10__SVBool_tPKtl(
+// CPP-CHECK-NEXT: entry:
+// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 8 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv8i1(<vscale x 16 x i1> [[PG:%.*]])
+// CPP-CHECK-NEXT: [[TMP1:%.*]] = getelementptr <vscale x 8 x i16>, ptr [[BASE:%.*]], i64 [[VNUM:%.*]]
+// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call { <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16> } @llvm.aarch64.sve.ld3q.sret.nxv8i16(<vscale x 8 x i1> [[TMP0]], ptr [[TMP1]])
+// CPP-CHECK-NEXT: [[TMP3:%.*]] = extractvalue { <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16> } [[TMP2]], 0
+// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call <vscale x 24 x i16> @llvm.vector.insert.nxv24i16.nxv8i16(<vscale x 24 x i16> poison, <vscale x 8 x i16> [[TMP3]], i64 0)
+// CPP-CHECK-NEXT: [[TMP5:%.*]] = extractvalue { <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16> } [[TMP2]], 1
+// CPP-CHECK-NEXT: [[TMP6:%.*]] = tail call <vscale x 24 x i16> @llvm.vector.insert.nxv24i16.nxv8i16(<vscale x 24 x i16> [[TMP4]], <vscale x 8 x i16> [[TMP5]], i64 8)
+// CPP-CHECK-NEXT: [[TMP7:%.*]] = extractvalue { <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16> } [[TMP2]], 2
+// CPP-CHECK-NEXT: [[TMP8:%.*]] = tail call <vscale x 24 x i16> @llvm.vector.insert.nxv24i16.nxv8i16(<vscale x 24 x i16> [[TMP6]], <vscale x 8 x i16> [[TMP7]], i64 16)
+// CPP-CHECK-NEXT: ret <vscale x 24 x i16> [[TMP8]]
+//
+svuint16x3_t test_svld3q_vnum_u16(svbool_t pg, const uint16_t *base, int64_t vnum)
+{
+ return SVE_ACLE_FUNC(svld3q_vnum,,_u16,)(pg, base, vnum);
+}
+
+// CHECK-LABEL: @test_svld3q_vnum_s16(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 8 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv8i1(<vscale x 16 x i1> [[PG:%.*]])
+// CHECK-NEXT: [[TMP1:%.*]] = getelementptr <vscale x 8 x i16>, ptr [[BASE:%.*]], i64 [[VNUM:%.*]]
+// CHECK-NEXT: [[TMP2:%.*]] = tail call { <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16> } @llvm.aarch64.sve.ld3q.sret.nxv8i16(<vscale x 8 x i1> [[TMP0]], ptr [[TMP1]])
+// CHECK-NEXT: [[TMP3:%.*]] = extractvalue { <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16> } [[TMP2]], 0
+// CHECK-NEXT: [[TMP4:%.*]] = tail call <vscale x 24 x i16> @llvm.vector.insert.nxv24i16.nxv8i16(<vscale x 24 x i16> poison, <vscale x 8 x i16> [[TMP3]], i64 0)
+// CHECK-NEXT: [[TMP5:%.*]] = extractvalue { <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16> } [[TMP2]], 1
+// CHECK-NEXT: [[TMP6:%.*]] = tail call <vscale x 24 x i16> @llvm.vector.insert.nxv24i16.nxv8i16(<vscale x 24 x i16> [[TMP4]], <vscale x 8 x i16> [[TMP5]], i64 8)
+// CHECK-NEXT: [[TMP7:%.*]] = extractvalue { <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16> } [[TMP2]], 2
+// CHECK-NEXT: [[TMP8:%.*]] = tail call <vscale x 24 x i16> @llvm.vector.insert.nxv24i16.nxv8i16(<vscale x 24 x i16> [[TMP6]], <vscale x 8 x i16> [[TMP7]], i64 16)
+// CHECK-NEXT: ret <vscale x 24 x i16> [[TMP8]]
+//
+// CPP-CHECK-LABEL: @_Z20test_svld3q_vnum_s16u10__SVBool_tPKsl(
+// CPP-CHECK-NEXT: entry:
+// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 8 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv8i1(<vscale x 16 x i1> [[PG:%.*]])
+// CPP-CHECK-NEXT: [[TMP1:%.*]] = getelementptr <vscale x 8 x i16>, ptr [[BASE:%.*]], i64 [[VNUM:%.*]]
+// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call { <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16> } @llvm.aarch64.sve.ld3q.sret.nxv8i16(<vscale x 8 x i1> [[TMP0]], ptr [[TMP1]])
+// CPP-CHECK-NEXT: [[TMP3:%.*]] = extractvalue { <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16> } [[TMP2]], 0
+// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call <vscale x 24 x i16> @llvm.vector.insert.nxv24i16.nxv8i16(<vscale x 24 x i16> poison, <vscale x 8 x i16> [[TMP3]], i64 0)
+// CPP-CHECK-NEXT: [[TMP5:%.*]] = extractvalue { <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16> } [[TMP2]], 1
+// CPP-CHECK-NEXT: [[TMP6:%.*]] = tail call <vscale x 24 x i16> @llvm.vector.insert.nxv24i16.nxv8i16(<vscale x 24 x i16> [[TMP4]], <vscale x 8 x i16> [[TMP5]], i64 8)
+// CPP-CHECK-NEXT: [[TMP7:%.*]] = extractvalue { <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16> } [[TMP2]], 2
+// CPP-CHECK-NEXT: [[TMP8:%.*]] = tail call <vscale x 24 x i16> @llvm.vector.insert.nxv24i16.nxv8i16(<vscale x 24 x i16> [[TMP6]], <vscale x 8 x i16> [[TMP7]], i64 16)
+// CPP-CHECK-NEXT: ret <vscale x 24 x i16> [[TMP8]]
+//
+svint16x3_t test_svld3q_vnum_s16(svbool_t pg, const int16_t *base, int64_t vnum)
+{
+ return SVE_ACLE_FUNC(svld3q_vnum,,_s16,)(pg, base, vnum);
+}
+
+// CHECK-LABEL: @test_svld3q_vnum_u32(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 4 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv4i1(<vscale x 16 x i1> [[PG:%.*]])
+// CHECK-NEXT: [[TMP1:%.*]] = getelementptr <vscale x 4 x i32>, ptr [[BASE:%.*]], i64 [[VNUM:%.*]]
+// CHECK-NEXT: [[TMP2:%.*]] = tail call { <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32> } @llvm.aarch64.sve.ld3q.sret.nxv4i32(<vscale x 4 x i1> [[TMP0]], ptr [[TMP1]])
+// CHECK-NEXT: [[TMP3:%.*]] = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32> } [[TMP2]], 0
+// CHECK-NEXT: [[TMP4:%.*]] = tail call <vscale x 12 x i32> @llvm.vector.insert.nxv12i32.nxv4i32(<vscale x 12 x i32> poison, <vscale x 4 x i32> [[TMP3]], i64 0)
+// CHECK-NEXT: [[TMP5:%.*]] = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32> } [[TMP2]], 1
+// CHECK-NEXT: [[TMP6:%.*]] = tail call <vscale x 12 x i32> @llvm.vector.insert.nxv12i32.nxv4i32(<vscale x 12 x i32> [[TMP4]], <vscale x 4 x i32> [[TMP5]], i64 4)
+// CHECK-NEXT: [[TMP7:%.*]] = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32> } [[TMP2]], 2
+// CHECK-NEXT: [[TMP8:%.*]] = tail call <vscale x 12 x i32> @llvm.vector.insert.nxv12i32.nxv4i32(<vscale x 12 x i32> [[TMP6]], <vscale x 4 x i32> [[TMP7]], i64 8)
+// CHECK-NEXT: ret <vscale x 12 x i32> [[TMP8]]
+//
+// CPP-CHECK-LABEL: @_Z20test_svld3q_vnum_u32u10__SVBool_tPKjl(
+// CPP-CHECK-NEXT: entry:
+// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 4 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv4i1(<vscale x 16 x i1> [[PG:%.*]])
+// CPP-CHECK-NEXT: [[TMP1:%.*]] = getelementptr <vscale x 4 x i32>, ptr [[BASE:%.*]], i64 [[VNUM:%.*]]
+// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call { <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32> } @llvm.aarch64.sve.ld3q.sret.nxv4i32(<vscale x 4 x i1> [[TMP0]], ptr [[TMP1]])
+// CPP-CHECK-NEXT: [[TMP3:%.*]] = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32> } [[TMP2]], 0
+// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call <vscale x 12 x i32> @llvm.vector.insert.nxv12i32.nxv4i32(<vscale x 12 x i32> poison, <vscale x 4 x i32> [[TMP3]], i64 0)
+// CPP-CHECK-NEXT: [[TMP5:%.*]] = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32> } [[TMP2]], 1
+// CPP-CHECK-NEXT: [[TMP6:%.*]] = tail call <vscale x 12 x i32> @llvm.vector.insert.nxv12i32.nxv4i32(<vscale x 12 x i32> [[TMP4]], <vscale x 4 x i32> [[TMP5]], i64 4)
+// CPP-CHECK-NEXT: [[TMP7:%.*]] = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32> } [[TMP2]], 2
+// CPP-CHECK-NEXT: [[TMP8:%.*]] = tail call <vscale x 12 x i32> @llvm.vector.insert.nxv12i32.nxv4i32(<vscale x 12 x i32> [[TMP6]], <vscale x 4 x i32> [[TMP7]], i64 8)
+// CPP-CHECK-NEXT: ret <vscale x 12 x i32> [[TMP8]]
+//
+svuint32x3_t test_svld3q_vnum_u32(svbool_t pg, const uint32_t *base, int64_t vnum)
+{
+ return SVE_ACLE_FUNC(svld3q_vnum,,_u32,)(pg, base, vnum);
+}
+
+// CHECK-LABEL: @test_svld3q_vnum_s32(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 4 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv4i1(<vscale x 16 x i1> [[PG:%.*]])
+// CHECK-NEXT: [[TMP1:%.*]] = getelementptr <vscale x 4 x i32>, ptr [[BASE:%.*]], i64 [[VNUM:%.*]]
+// CHECK-NEXT: [[TMP2:%.*]] = tail call { <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32> } @llvm.aarch64.sve.ld3q.sret.nxv4i32(<vscale x 4 x i1> [[TMP0]], ptr [[TMP1]])
+// CHECK-NEXT: [[TMP3:%.*]] = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32> } [[TMP2]], 0
+// CHECK-NEXT: [[TMP4:%.*]] = tail call <vscale x 12 x i32> @llvm.vector.insert.nxv12i32.nxv4i32(<vscale x 12 x i32> poison, <vscale x 4 x i32> [[TMP3]], i64 0)
+// CHECK-NEXT: [[TMP5:%.*]] = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32> } [[TMP2]], 1
+// CHECK-NEXT: [[TMP6:%.*]] = tail call <vscale x 12 x i32> @llvm.vector.insert.nxv12i32.nxv4i32(<vscale x 12 x i32> [[TMP4]], <vscale x 4 x i32> [[TMP5]], i64 4)
+// CHECK-NEXT: [[TMP7:%.*]] = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32> } [[TMP2]], 2
+// CHECK-NEXT: [[TMP8:%.*]] = tail call <vscale x 12 x i32> @llvm.vector.insert.nxv12i32.nxv4i32(<vscale x 12 x i32> [[TMP6]], <vscale x 4 x i32> [[TMP7]], i64 8)
+// CHECK-NEXT: ret <vscale x 12 x i32> [[TMP8]]
+//
+// CPP-CHECK-LABEL: @_Z20test_svld3q_vnum_s32u10__SVBool_tPKil(
+// CPP-CHECK-NEXT: entry:
+// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 4 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv4i1(<vscale x 16 x i1> [[PG:%.*]])
+// CPP-CHECK-NEXT: [[TMP1:%.*]] = getelementptr <vscale x 4 x i32>, ptr [[BASE:%.*]], i64 [[VNUM:%.*]]
+// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call { <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32> } @llvm.aarch64.sve.ld3q.sret.nxv4i32(<vscale x 4 x i1> [[TMP0]], ptr [[TMP1]])
+// CPP-CHECK-NEXT: [[TMP3:%.*]] = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32> } [[TMP2]], 0
+// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call <vscale x 12 x i32> @llvm.vector.insert.nxv12i32.nxv4i32(<vscale x 12 x i32> poison, <vscale x 4 x i32> [[TMP3]], i64 0)
+// CPP-CHECK-NEXT: [[TMP5:%.*]] = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32> } [[TMP2]], 1
+// CPP-CHECK-NEXT: [[TMP6:%.*]] = tail call <vscale x 12 x i32> @llvm.vector.insert.nxv12i32.nxv4i32(<vscale x 12 x i32> [[TMP4]], <vscale x 4 x i32> [[TMP5]], i64 4)
+// CPP-CHECK-NEXT: [[TMP7:%.*]] = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32> } [[TMP2]], 2
+// CPP-CHECK-NEXT: [[TMP8:%.*]] = tail call <vscale x 12 x i32> @llvm.vector.insert.nxv12i32.nxv4i32(<vscale x 12 x i32> [[TMP6]], <vscale x 4 x i32> [[TMP7]], i64 8)
+// CPP-CHECK-NEXT: ret <vscale x 12 x i32> [[TMP8]]
+//
+svint32x3_t test_svld3q_vnum_s32(svbool_t pg, const int32_t *base, int64_t vnum)
+{
+ return SVE_ACLE_FUNC(svld3q_vnum,,_s32,)(pg, base, vnum);
+}
+
+// CHECK-LABEL: @test_svld3q_vnum_u64(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 2 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv2i1(<vscale x 16 x i1> [[PG:%.*]])
+// CHECK-NEXT: [[TMP1:%.*]] = getelementptr <vscale x 2 x i64>, ptr [[BASE:%.*]], i64 [[VNUM:%.*]]
+// CHECK-NEXT: [[TMP2:%.*]] = tail call { <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64> } @llvm.aarch64.sve.ld3q.sret.nxv2i64(<vscale x 2 x i1> [[TMP0]], ptr [[TMP1]])
+// CHECK-NEXT: [[TMP3:%.*]] = extractvalue { <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64> } [[TMP2]], 0
+// CHECK-NEXT: [[TMP4:%.*]] = tail call <vscale x 6 x i64> @llvm.vector.insert.nxv6i64.nxv2i64(<vscale x 6 x i64> poison, <vscale x 2 x i64> [[TMP3]], i64 0)
+// CHECK-NEXT: [[TMP5:%.*]] = extractvalue { <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64> } [[TMP2]], 1
+// CHECK-NEXT: [[TMP6:%.*]] = tail call <vscale x 6 x i64> @llvm.vector.insert.nxv6i64.nxv2i64(<vscale x 6 x i64> [[TMP4]], <vscale x 2 x i64> [[TMP5]], i64 2)
+// CHECK-NEXT: [[TMP7:%.*]] = extractvalue { <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64> } [[TMP2]], 2
+// CHECK-NEXT: [[TMP8:%.*]] = tail call <vscale x 6 x i64> @llvm.vector.insert.nxv6i64.nxv2i64(<vscale x 6 x i64> [[TMP6]], <vscale x 2 x i64> [[TMP7]], i64 4)
+// CHECK-NEXT: ret <vscale x 6 x i64> [[TMP8]]
+//
+// CPP-CHECK-LABEL: @_Z20test_svld3q_vnum_u64u10__SVBool_tPKml(
+// CPP-CHECK-NEXT: entry:
+// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 2 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv2i1(<vscale x 16 x i1> [[PG:%.*]])
+// CPP-CHECK-NEXT: [[TMP1:%.*]] = getelementptr <vscale x 2 x i64>, ptr [[BASE:%.*]], i64 [[VNUM:%.*]]
+// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call { <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64> } @llvm.aarch64.sve.ld3q.sret.nxv2i64(<vscale x 2 x i1> [[TMP0]], ptr [[TMP1]])
+// CPP-CHECK-NEXT: [[TMP3:%.*]] = extractvalue { <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64> } [[TMP2]], 0
+// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call <vscale x 6 x i64> @llvm.vector.insert.nxv6i64.nxv2i64(<vscale x 6 x i64> poison, <vscale x 2 x i64> [[TMP3]], i64 0)
+// CPP-CHECK-NEXT: [[TMP5:%.*]] = extractvalue { <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64> } [[TMP2]], 1
+// CPP-CHECK-NEXT: [[TMP6:%.*]] = tail call <vscale x 6 x i64> @llvm.vector.insert.nxv6i64.nxv2i64(<vscale x 6 x i64> [[TMP4]], <vscale x 2 x i64> [[TMP5]], i64 2)
+// CPP-CHECK-NEXT: [[TMP7:%.*]] = extractvalue { <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64> } [[TMP2]], 2
+// CPP-CHECK-NEXT: [[TMP8:%.*]] = tail call <vscale x 6 x i64> @llvm.vector.insert.nxv6i64.nxv2i64(<vscale x 6 x i64> [[TMP6]], <vscale x 2 x i64> [[TMP7]], i64 4)
+// CPP-CHECK-NEXT: ret <vscale x 6 x i64> [[TMP8]]
+//
+svuint64x3_t test_svld3q_vnum_u64(svbool_t pg, const uint64_t *base, int64_t vnum)
+{
+ return SVE_ACLE_FUNC(svld3q_vnum,,_u64,)(pg, base, vnum);
+}
+
+// CHECK-LABEL: @test_svld3q_vnum_s64(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 2 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv2i1(<vscale x 16 x i1> [[PG:%.*]])
+// CHECK-NEXT: [[TMP1:%.*]] = getelementptr <vscale x 2 x i64>, ptr [[BASE:%.*]], i64 [[VNUM:%.*]]
+// CHECK-NEXT: [[TMP2:%.*]] = tail call { <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64> } @llvm.aarch64.sve.ld3q.sret.nxv2i64(<vscale x 2 x i1> [[TMP0]], ptr [[TMP1]])
+// CHECK-NEXT: [[TMP3:%.*]] = extractvalue { <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64> } [[TMP2]], 0
+// CHECK-NEXT: [[TMP4:%.*]] = tail call <vscale x 6 x i64> @llvm.vector.insert.nxv6i64.nxv2i64(<vscale x 6 x i64> poison, <vscale x 2 x i64> [[TMP3]], i64 0)
+// CHECK-NEXT: [[TMP5:%.*]] = extractvalue { <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64> } [[TMP2]], 1
+// CHECK-NEXT: [[TMP6:%.*]] = tail call <vscale x 6 x i64> @llvm.vector.insert.nxv6i64.nxv2i64(<vscale x 6 x i64> [[TMP4]], <vscale x 2 x i64> [[TMP5]], i64 2)
+// CHECK-NEXT: [[TMP7:%.*]] = extractvalue { <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64> } [[TMP2]], 2
+// CHECK-NEXT: [[TMP8:%.*]] = tail call <vscale x 6 x i64> @llvm.vector.insert.nxv6i64.nxv2i64(<vscale x 6 x i64> [[TMP6]], <vscale x 2 x i64> [[TMP7]], i64 4)
+// CHECK-NEXT: ret <vscale x 6 x i64> [[TMP8]]
+//
+// CPP-CHECK-LABEL: @_Z20test_svld3q_vnum_s64u10__SVBool_tPKll(
+// CPP-CHECK-NEXT: entry:
+// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 2 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv2i1(<vscale x 16 x i1> [[PG:%.*]])
+// CPP-CHECK-NEXT: [[TMP1:%.*]] = getelementptr <vscale x 2 x i64>, ptr [[BASE:%.*]], i64 [[VNUM:%.*]]
+// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call { <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64> } @llvm.aarch64.sve.ld3q.sret.nxv2i64(<vscale x 2 x i1> [[TMP0]], ptr [[TMP1]])
+// CPP-CHECK-NEXT: [[TMP3:%.*]] = extractvalue { <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64> } [[TMP2]], 0
+// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call <vscale x 6 x i64> @llvm.vector.insert.nxv6i64.nxv2i64(<vscale x 6 x i64> poison, <vscale x 2 x i64> [[TMP3]], i64 0)
+// CPP-CHECK-NEXT: [[TMP5:%.*]] = extractvalue { <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64> } [[TMP2]], 1
+// CPP-CHECK-NEXT: [[TMP6:%.*]] = tail call <vscale x 6 x i64> @llvm.vector.insert.nxv6i64.nxv2i64(<vscale x 6 x i64> [[TMP4]], <vscale x 2 x i64> [[TMP5]], i64 2)
+// CPP-CHECK-NEXT: [[TMP7:%.*]] = extractvalue { <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64> } [[TMP2]], 2
+// CPP-CHECK-NEXT: [[TMP8:%.*]] = tail call <vscale x 6 x i64> @llvm.vector.insert.nxv6i64.nxv2i64(<vscale x 6 x i64> [[TMP6]], <vscale x 2 x i64> [[TMP7]], i64 4)
+// CPP-CHECK-NEXT: ret <vscale x 6 x i64> [[TMP8]]
+//
+svint64x3_t test_svld3q_vnum_s64(svbool_t pg, const int64_t *base, int64_t vnum)
+{
+ return SVE_ACLE_FUNC(svld3q_vnum,,_s64,)(pg, base, vnum);
+}
+
+// CHECK-LABEL: @test_svld3q_vnum_f16(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 8 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv8i1(<vscale x 16 x i1> [[PG:%.*]])
+// CHECK-NEXT: [[TMP1:%.*]] = getelementptr <vscale x 8 x half>, ptr [[BASE:%.*]], i64 [[VNUM:%.*]]
+// CHECK-NEXT: [[TMP2:%.*]] = tail call { <vscale x 8 x half>, <vscale x 8 x half>, <vscale x 8 x half> } @llvm.aarch64.sve.ld3q.sret.nxv8f16(<vscale x 8 x i1> [[TMP0]], ptr [[TMP1]])
+// CHECK-NEXT: [[TMP3:%.*]] = extractvalue { <vscale x 8 x half>, <vscale x 8 x half>, <vscale x 8 x half> } [[TMP2]], 0
+// CHECK-NEXT: [[TMP4:%.*]] = tail call <vscale x 24 x half> @llvm.vector.insert.nxv24f16.nxv8f16(<vscale x 24 x half> poison, <vscale x 8 x half> [[TMP3]], i64 0)
+// CHECK-NEXT: [[TMP5:%.*]] = extractvalue { <vscale x 8 x half>, <vscale x 8 x half>, <vscale x 8 x half> } [[TMP2]], 1
+// CHECK-NEXT: [[TMP6:%.*]] = tail call <vscale x 24 x half> @llvm.vector.insert.nxv24f16.nxv8f16(<vscale x 24 x half> [[TMP4]], <vscale x 8 x half> [[TMP5]], i64 8)
+// CHECK-NEXT: [[TMP7:%.*]] = extractvalue { <vscale x 8 x half>, <vscale x 8 x half>, <vscale x 8 x half> } [[TMP2]], 2
+// CHECK-NEXT: [[TMP8:%.*]] = tail call <vscale x 24 x half> @llvm.vector.insert.nxv24f16.nxv8f16(<vscale x 24 x half> [[TMP6]], <vscale x 8 x half> [[TMP7]], i64 16)
+// CHECK-NEXT: ret <vscale x 24 x half> [[TMP8]]
+//
+// CPP-CHECK-LABEL: @_Z20test_svld3q_vnum_f16u10__SVBool_tPKDhl(
+// CPP-CHECK-NEXT: entry:
+// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 8 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv8i1(<vscale x 16 x i1> [[PG:%.*]])
+// CPP-CHECK-NEXT: [[TMP1:%.*]] = getelementptr <vscale x 8 x half>, ptr [[BASE:%.*]], i64 [[VNUM:%.*]]
+// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call { <vscale x 8 x half>, <vscale x 8 x half>, <vscale x 8 x half> } @llvm.aarch64.sve.ld3q.sret.nxv8f16(<vscale x 8 x i1> [[TMP0]], ptr [[TMP1]])
+// CPP-CHECK-NEXT: [[TMP3:%.*]] = extractvalue { <vscale x 8 x half>, <vscale x 8 x half>, <vscale x 8 x half> } [[TMP2]], 0
+// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call <vscale x 24 x half> @llvm.vector.insert.nxv24f16.nxv8f16(<vscale x 24 x half> poison, <vscale x 8 x half> [[TMP3]], i64 0)
+// CPP-CHECK-NEXT: [[TMP5:%.*]] = extractvalue { <vscale x 8 x half>, <vscale x 8 x half>, <vscale x 8 x half> } [[TMP2]], 1
+// CPP-CHECK-NEXT: [[TMP6:%.*]] = tail call <vscale x 24 x half> @llvm.vector.insert.nxv24f16.nxv8f16(<vscale x 24 x half> [[TMP4]], <vscale x 8 x half> [[TMP5]], i64 8)
+// CPP-CHECK-NEXT: [[TMP7:%.*]] = extractvalue { <vscale x 8 x half>, <vscale x 8 x half>, <vscale x 8 x half> } [[TMP2]], 2
+// CPP-CHECK-NEXT: [[TMP8:%.*]] = tail call <vscale x 24 x half> @llvm.vector.insert.nxv24f16.nxv8f16(<vscale x 24 x half> [[TMP6]], <vscale x 8 x half> [[TMP7]], i64 16)
+// CPP-CHECK-NEXT: ret <vscale x 24 x half> [[TMP8]]
+//
+svfloat16x3_t test_svld3q_vnum_f16(svbool_t pg, const float16_t *base, int64_t vnum)
+{
+ return SVE_ACLE_FUNC(svld3q_vnum,,_f16,)(pg, base, vnum);
+}
+
+// CHECK-LABEL: @test_svld3q_vnum_bf16(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 8 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv8i1(<vscale x 16 x i1> [[PG:%.*]])
+// CHECK-NEXT: [[TMP1:%.*]] = getelementptr <vscale x 8 x bfloat>, ptr [[BASE:%.*]], i64 [[VNUM:%.*]]
+// CHECK-NEXT: [[TMP2:%.*]] = tail call { <vscale x 8 x bfloat>, <vscale x 8 x bfloat>, <vscale x 8 x bfloat> } @llvm.aarch64.sve.ld3q.sret.nxv8bf16(<vscale x 8 x i1> [[TMP0]], ptr [[TMP1]])
+// CHECK-NEXT: [[TMP3:%.*]] = extractvalue { <vscale x 8 x bfloat>, <vscale x 8 x bfloat>, <vscale x 8 x bfloat> } [[TMP2]], 0
+// CHECK-NEXT: [[TMP4:%.*]] = tail call <vscale x 24 x bfloat> @llvm.vector.insert.nxv24bf16.nxv8bf16(<vscale x 24 x bfloat> poison, <vscale x 8 x bfloat> [[TMP3]], i64 0)
+// CHECK-NEXT: [[TMP5:%.*]] = extractvalue { <vscale x 8 x bfloat>, <vscale x 8 x bfloat>, <vscale x 8 x bfloat> } [[TMP2]], 1
+// CHECK-NEXT: [[TMP6:%.*]] = tail call <vscale x 24 x bfloat> @llvm.vector.insert.nxv24bf16.nxv8bf16(<vscale x 24 x bfloat> [[TMP4]], <vscale x 8 x bfloat> [[TMP5]], i64 8)
+// CHECK-NEXT: [[TMP7:%.*]] = extractvalue { <vscale x 8 x bfloat>, <vscale x 8 x bfloat>, <vscale x 8 x bfloat> } [[TMP2]], 2
+// CHECK-NEXT: [[TMP8:%.*]] = tail call <vscale x 24 x bfloat> @llvm.vector.insert.nxv24bf16.nxv8bf16(<vscale x 24 x bfloat> [[TMP6]], <vscale x 8 x bfloat> [[TMP7]], i64 16)
+// CHECK-NEXT: ret <vscale x 24 x bfloat> [[TMP8]]
+//
+// CPP-CHECK-LABEL: @_Z21test_svld3q_vnum_bf16u10__SVBool_tPKu6__bf16l(
+// CPP-CHECK-NEXT: entry:
+// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 8 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv8i1(<vscale x 16 x i1> [[PG:%.*]])
+// CPP-CHECK-NEXT: [[TMP1:%.*]] = getelementptr <vscale x 8 x bfloat>, ptr [[BASE:%.*]], i64 [[VNUM:%.*]]
+// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call { <vscale x 8 x bfloat>, <vscale x 8 x bfloat>, <vscale x 8 x bfloat> } @llvm.aarch64.sve.ld3q.sret.nxv8bf16(<vscale x 8 x i1> [[TMP0]], ptr [[TMP1]])
+// CPP-CHECK-NEXT: [[TMP3:%.*]] = extractvalue { <vscale x 8 x bfloat>, <vscale x 8 x bfloat>, <vscale x 8 x bfloat> } [[TMP2]], 0
+// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call <vscale x 24 x bfloat> @llvm.vector.insert.nxv24bf16.nxv8bf16(<vscale x 24 x bfloat> poison, <vscale x 8 x bfloat> [[TMP3]], i64 0)
+// CPP-CHECK-NEXT: [[TMP5:%.*]] = extractvalue { <vscale x 8 x bfloat>, <vscale x 8 x bfloat>, <vscale x 8 x bfloat> } [[TMP2]], 1
+// CPP-CHECK-NEXT: [[TMP6:%.*]] = tail call <vscale x 24 x bfloat> @llvm.vector.insert.nxv24bf16.nxv8bf16(<vscale x 24 x bfloat> [[TMP4]], <vscale x 8 x bfloat> [[TMP5]], i64 8)
+// CPP-CHECK-NEXT: [[TMP7:%.*]] = extractvalue { <vscale x 8 x bfloat>, <vscale x 8 x bfloat>, <vscale x 8 x bfloat> } [[TMP2]], 2
+// CPP-CHECK-NEXT: [[TMP8:%.*]] = tail call <vscale x 24 x bfloat> @llvm.vector.insert.nxv24bf16.nxv8bf16(<vscale x 24 x bfloat> [[TMP6]], <vscale x 8 x bfloat> [[TMP7]], i64 16)
+// CPP-CHECK-NEXT: ret <vscale x 24 x bfloat> [[TMP8]]
+//
+svbfloat16x3_t test_svld3q_vnum_bf16(svbool_t pg, const bfloat16_t *base, int64_t vnum)
+{
+ return SVE_ACLE_FUNC(svld3q_vnum,,_bf16,)(pg, base, vnum);
+}
+
+// CHECK-LABEL: @test_svld3q_vnum_f32(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 4 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv4i1(<vscale x 16 x i1> [[PG:%.*]])
+// CHECK-NEXT: [[TMP1:%.*]] = getelementptr <vscale x 4 x float>, ptr [[BASE:%.*]], i64 [[VNUM:%.*]]
+// CHECK-NEXT: [[TMP2:%.*]] = tail call { <vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float> } @llvm.aarch64.sve.ld3q.sret.nxv4f32(<vscale x 4 x i1> [[TMP0]], ptr [[TMP1]])
+// CHECK-NEXT: [[TMP3:%.*]] = extractvalue { <vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float> } [[TMP2]], 0
+// CHECK-NEXT: [[TMP4:%.*]] = tail call <vscale x 12 x float> @llvm.vector.insert.nxv12f32.nxv4f32(<vscale x 12 x float> poison, <vscale x 4 x float> [[TMP3]], i64 0)
+// CHECK-NEXT: [[TMP5:%.*]] = extractvalue { <vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float> } [[TMP2]], 1
+// CHECK-NEXT: [[TMP6:%.*]] = tail call <vscale x 12 x float> @llvm.vector.insert.nxv12f32.nxv4f32(<vscale x 12 x float> [[TMP4]], <vscale x 4 x float> [[TMP5]], i64 4)
+// CHECK-NEXT: [[TMP7:%.*]] = extractvalue { <vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float> } [[TMP2]], 2
+// CHECK-NEXT: [[TMP8:%.*]] = tail call <vscale x 12 x float> @llvm.vector.insert.nxv12f32.nxv4f32(<vscale x 12 x float> [[TMP6]], <vscale x 4 x float> [[TMP7]], i64 8)
+// CHECK-NEXT: ret <vscale x 12 x float> [[TMP8]]
+//
+// CPP-CHECK-LABEL: @_Z20test_svld3q_vnum_f32u10__SVBool_tPKfl(
+// CPP-CHECK-NEXT: entry:
+// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 4 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv4i1(<vscale x 16 x i1> [[PG:%.*]])
+// CPP-CHECK-NEXT: [[TMP1:%.*]] = getelementptr <vscale x 4 x float>, ptr [[BASE:%.*]], i64 [[VNUM:%.*]]
+// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call { <vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float> } @llvm.aarch64.sve.ld3q.sret.nxv4f32(<vscale x 4 x i1> [[TMP0]], ptr [[TMP1]])
+// CPP-CHECK-NEXT: [[TMP3:%.*]] = extractvalue { <vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float> } [[TMP2]], 0
+// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call <vscale x 12 x float> @llvm.vector.insert.nxv12f32.nxv4f32(<vscale x 12 x float> poison, <vscale x 4 x float> [[TMP3]], i64 0)
+// CPP-CHECK-NEXT: [[TMP5:%.*]] = extractvalue { <vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float> } [[TMP2]], 1
+// CPP-CHECK-NEXT: [[TMP6:%.*]] = tail call <vscale x 12 x float> @llvm.vector.insert.nxv12f32.nxv4f32(<vscale x 12 x float> [[TMP4]], <vscale x 4 x float> [[TMP5]], i64 4)
+// CPP-CHECK-NEXT: [[TMP7:%.*]] = extractvalue { <vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float> } [[TMP2]], 2
+// CPP-CHECK-NEXT: [[TMP8:%.*]] = tail call <vscale x 12 x float> @llvm.vector.insert.nxv12f32.nxv4f32(<vscale x 12 x float> [[TMP6]], <vscale x 4 x float> [[TMP7]], i64 8)
+// CPP-CHECK-NEXT: ret <vscale x 12 x float> [[TMP8]]
+//
+svfloat32x3_t test_svld3q_vnum_f32(svbool_t pg, const float32_t *base, int64_t vnum)
+{
+ return SVE_ACLE_FUNC(svld3q_vnum,,_f32,)(pg, base, vnum);
+}
+
+// CHECK-LABEL: @test_svld3q_vnum_f64(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 2 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv2i1(<vscale x 16 x i1> [[PG:%.*]])
+// CHECK-NEXT: [[TMP1:%.*]] = getelementptr <vscale x 2 x double>, ptr [[BASE:%.*]], i64 [[VNUM:%.*]]
+// CHECK-NEXT: [[TMP2:%.*]] = tail call { <vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x double> } @llvm.aarch64.sve.ld3q.sret.nxv2f64(<vscale x 2 x i1> [[TMP0]], ptr [[TMP1]])
+// CHECK-NEXT: [[TMP3:%.*]] = extractvalue { <vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x double> } [[TMP2]], 0
+// CHECK-NEXT: [[TMP4:%.*]] = tail call <vscale x 6 x double> @llvm.vector.insert.nxv6f64.nxv2f64(<vscale x 6 x double> poison, <vscale x 2 x double> [[TMP3]], i64 0)
+// CHECK-NEXT: [[TMP5:%.*]] = extractvalue { <vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x double> } [[TMP2]], 1
+// CHECK-NEXT: [[TMP6:%.*]] = tail call <vscale x 6 x double> @llvm.vector.insert.nxv6f64.nxv2f64(<vscale x 6 x double> [[TMP4]], <vscale x 2 x double> [[TMP5]], i64 2)
+// CHECK-NEXT: [[TMP7:%.*]] = extractvalue { <vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x double> } [[TMP2]], 2
+// CHECK-NEXT: [[TMP8:%.*]] = tail call <vscale x 6 x double> @llvm.vector.insert.nxv6f64.nxv2f64(<vscale x 6 x double> [[TMP6]], <vscale x 2 x double> [[TMP7]], i64 4)
+// CHECK-NEXT: ret <vscale x 6 x double> [[TMP8]]
+//
+// CPP-CHECK-LABEL: @_Z20test_svld3q_vnum_f64u10__SVBool_tPKdl(
+// CPP-CHECK-NEXT: entry:
+// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 2 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv2i1(<vscale x 16 x i1> [[PG:%.*]])
+// CPP-CHECK-NEXT: [[TMP1:%.*]] = getelementptr <vscale x 2 x double>, ptr [[BASE:%.*]], i64 [[VNUM:%.*]]
+// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call { <vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x double> } @llvm.aarch64.sve.ld3q.sret.nxv2f64(<vscale x 2 x i1> [[TMP0]], ptr [[TMP1]])
+// CPP-CHECK-NEXT: [[TMP3:%.*]] = extractvalue { <vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x double> } [[TMP2]], 0
+// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call <vscale x 6 x double> @llvm.vector.insert.nxv6f64.nxv2f64(<vscale x 6 x double> poison, <vscale x 2 x double> [[TMP3]], i64 0)
+// CPP-CHECK-NEXT: [[TMP5:%.*]] = extractvalue { <vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x double> } [[TMP2]], 1
+// CPP-CHECK-NEXT: [[TMP6:%.*]] = tail call <vscale x 6 x double> @llvm.vector.insert.nxv6f64.nxv2f64(<vscale x 6 x double> [[TMP4]], <vscale x 2 x double> [[TMP5]], i64 2)
+// CPP-CHECK-NEXT: [[TMP7:%.*]] = extractvalue { <vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x double> } [[TMP2]], 2
+// CPP-CHECK-NEXT: [[TMP8:%.*]] = tail call <vscale x 6 x double> @llvm.vector.insert.nxv6f64.nxv2f64(<vscale x 6 x double> [[TMP6]], <vscale x 2 x double> [[TMP7]], i64 4)
+// CPP-CHECK-NEXT: ret <vscale x 6 x double> [[TMP8]]
+//
+svfloat64x3_t test_svld3q_vnum_f64(svbool_t pg, const float64_t *base, int64_t vnum)
+{
+ return SVE_ACLE_FUNC(svld3q_vnum,,_f64,)(pg, base, vnum);
+}
+
+// CHECK-LABEL: @test_svld4q_u8(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[TMP0:%.*]] = tail call { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } @llvm.aarch64.sve.ld4q.sret.nxv16i8(<vscale x 16 x i1> [[PG:%.*]], ptr [[BASE:%.*]])
+// CHECK-NEXT: [[TMP1:%.*]] = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } [[TMP0]], 0
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <vscale x 64 x i8> @llvm.vector.insert.nxv64i8.nxv16i8(<vscale x 64 x i8> poison, <vscale x 16 x i8> [[TMP1]], i64 0)
+// CHECK-NEXT: [[TMP3:%.*]] = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } [[TMP0]], 1
+// CHECK-NEXT: [[TMP4:%.*]] = tail call <vscale x 64 x i8> @llvm.vector.insert.nxv64i8.nxv16i8(<vscale x 64 x i8> [[TMP2]], <vscale x 16 x i8> [[TMP3]], i64 16)
+// CHECK-NEXT: [[TMP5:%.*]] = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } [[TMP0]], 2
+// CHECK-NEXT: [[TMP6:%.*]] = tail call <vscale x 64 x i8> @llvm.vector.insert.nxv64i8.nxv16i8(<vscale x 64 x i8> [[TMP4]], <vscale x 16 x i8> [[TMP5]], i64 32)
+// CHECK-NEXT: [[TMP7:%.*]] = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } [[TMP0]], 3
+// CHECK-NEXT: [[TMP8:%.*]] = tail call <vscale x 64 x i8> @llvm.vector.insert.nxv64i8.nxv16i8(<vscale x 64 x i8> [[TMP6]], <vscale x 16 x i8> [[TMP7]], i64 48)
+// CHECK-NEXT: ret <vscale x 64 x i8> [[TMP8]]
+//
+// CPP-CHECK-LABEL: @_Z14test_svld4q_u8u10__SVBool_tPKh(
+// CPP-CHECK-NEXT: entry:
+// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } @llvm.aarch64.sve.ld4q.sret.nxv16i8(<vscale x 16 x i1> [[PG:%.*]], ptr [[BASE:%.*]])
+// CPP-CHECK-NEXT: [[TMP1:%.*]] = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } [[TMP0]], 0
+// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call <vscale x 64 x i8> @llvm.vector.insert.nxv64i8.nxv16i8(<vscale x 64 x i8> poison, <vscale x 16 x i8> [[TMP1]], i64 0)
+// CPP-CHECK-NEXT: [[TMP3:%.*]] = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } [[TMP0]], 1
+// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call <vscale x 64 x i8> @llvm.vector.insert.nxv64i8.nxv16i8(<vscale x 64 x i8> [[TMP2]], <vscale x 16 x i8> [[TMP3]], i64 16)
+// CPP-CHECK-NEXT: [[TMP5:%.*]] = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } [[TMP0]], 2
+// CPP-CHECK-NEXT: [[TMP6:%.*]] = tail call <vscale x 64 x i8> @llvm.vector.insert.nxv64i8.nxv16i8(<vscale x 64 x i8> [[TMP4]], <vscale x 16 x i8> [[TMP5]], i64 32)
+// CPP-CHECK-NEXT: [[TMP7:%.*]] = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } [[TMP0]], 3
+// CPP-CHECK-NEXT: [[TMP8:%.*]] = tail call <vscale x 64 x i8> @llvm.vector.insert.nxv64i8.nxv16i8(<vscale x 64 x i8> [[TMP6]], <vscale x 16 x i8> [[TMP7]], i64 48)
+// CPP-CHECK-NEXT: ret <vscale x 64 x i8> [[TMP8]]
+//
+svuint8x4_t test_svld4q_u8(svbool_t pg, const uint8_t *base)
+{
+ return SVE_ACLE_FUNC(svld4q,,_u8,)(pg, base);
+}
+
+// CHECK-LABEL: @test_svld4q_u16(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 8 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv8i1(<vscale x 16 x i1> [[PG:%.*]])
+// CHECK-NEXT: [[TMP1:%.*]] = tail call { <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16> } @llvm.aarch64.sve.ld4q.sret.nxv8i16(<vscale x 8 x i1> [[TMP0]], ptr [[BASE:%.*]])
+// CHECK-NEXT: [[TMP2:%.*]] = extractvalue { <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16> } [[TMP1]], 0
+// CHECK-NEXT: [[TMP3:%.*]] = tail call <vscale x 32 x i16> @llvm.vector.insert.nxv32i16.nxv8i16(<vscale x 32 x i16> poison, <vscale x 8 x i16> [[TMP2]], i64 0)
+// CHECK-NEXT: [[TMP4:%.*]] = extractvalue { <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16> } [[TMP1]], 1
+// CHECK-NEXT: [[TMP5:%.*]] = tail call <vscale x 32 x i16> @llvm.vector.insert.nxv32i16.nxv8i16(<vscale x 32 x i16> [[TMP3]], <vscale x 8 x i16> [[TMP4]], i64 8)
+// CHECK-NEXT: [[TMP6:%.*]] = extractvalue { <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16> } [[TMP1]], 2
+// CHECK-NEXT: [[TMP7:%.*]] = tail call <vscale x 32 x i16> @llvm.vector.insert.nxv32i16.nxv8i16(<vscale x 32 x i16> [[TMP5]], <vscale x 8 x i16> [[TMP6]], i64 16)
+// CHECK-NEXT: [[TMP8:%.*]] = extractvalue { <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16> } [[TMP1]], 3
+// CHECK-NEXT: [[TMP9:%.*]] = tail call <vscale x 32 x i16> @llvm.vector.insert.nxv32i16.nxv8i16(<vscale x 32 x i16> [[TMP7]], <vscale x 8 x i16> [[TMP8]], i64 24)
+// CHECK-NEXT: ret <vscale x 32 x i16> [[TMP9]]
+//
+// CPP-CHECK-LABEL: @_Z15test_svld4q_u16u10__SVBool_tPKt(
+// CPP-CHECK-NEXT: entry:
+// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 8 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv8i1(<vscale x 16 x i1> [[PG:%.*]])
+// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call { <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16> } @llvm.aarch64.sve.ld4q.sret.nxv8i16(<vscale x 8 x i1> [[TMP0]], ptr [[BASE:%.*]])
+// CPP-CHECK-NEXT: [[TMP2:%.*]] = extractvalue { <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16> } [[TMP1]], 0
+// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call <vscale x 32 x i16> @llvm.vector.insert.nxv32i16.nxv8i16(<vscale x 32 x i16> poison, <vscale x 8 x i16> [[TMP2]], i64 0)
+// CPP-CHECK-NEXT: [[TMP4:%.*]] = extractvalue { <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16> } [[TMP1]], 1
+// CPP-CHECK-NEXT: [[TMP5:%.*]] = tail call <vscale x 32 x i16> @llvm.vector.insert.nxv32i16.nxv8i16(<vscale x 32 x i16> [[TMP3]], <vscale x 8 x i16> [[TMP4]], i64 8)
+// CPP-CHECK-NEXT: [[TMP6:%.*]] = extractvalue { <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16> } [[TMP1]], 2
+// CPP-CHECK-NEXT: [[TMP7:%.*]] = tail call <vscale x 32 x i16> @llvm.vector.insert.nxv32i16.nxv8i16(<vscale x 32 x i16> [[TMP5]], <vscale x 8 x i16> [[TMP6]], i64 16)
+// CPP-CHECK-NEXT: [[TMP8:%.*]] = extractvalue { <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16> } [[TMP1]], 3
+// CPP-CHECK-NEXT: [[TMP9:%.*]] = tail call <vscale x 32 x i16> @llvm.vector.insert.nxv32i16.nxv8i16(<vscale x 32 x i16> [[TMP7]], <vscale x 8 x i16> [[TMP8]], i64 24)
+// CPP-CHECK-NEXT: ret <vscale x 32 x i16> [[TMP9]]
+//
+svuint16x4_t test_svld4q_u16(svbool_t pg, const uint16_t *base)
+{
+ return SVE_ACLE_FUNC(svld4q,,_u16,)(pg, base);
+}
+
+// CHECK-LABEL: @test_svld4q_s16(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 8 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv8i1(<vscale x 16 x i1> [[PG:%.*]])
+// CHECK-NEXT: [[TMP1:%.*]] = tail call { <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16> } @llvm.aarch64.sve.ld4q.sret.nxv8i16(<vscale x 8 x i1> [[TMP0]], ptr [[BASE:%.*]])
+// CHECK-NEXT: [[TMP2:%.*]] = extractvalue { <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16> } [[TMP1]], 0
+// CHECK-NEXT: [[TMP3:%.*]] = tail call <vscale x 32 x i16> @llvm.vector.insert.nxv32i16.nxv8i16(<vscale x 32 x i16> poison, <vscale x 8 x i16> [[TMP2]], i64 0)
+// CHECK-NEXT: [[TMP4:%.*]] = extractvalue { <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16> } [[TMP1]], 1
+// CHECK-NEXT: [[TMP5:%.*]] = tail call <vscale x 32 x i16> @llvm.vector.insert.nxv32i16.nxv8i16(<vscale x 32 x i16> [[TMP3]], <vscale x 8 x i16> [[TMP4]], i64 8)
+// CHECK-NEXT: [[TMP6:%.*]] = extractvalue { <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16> } [[TMP1]], 2
+// CHECK-NEXT: [[TMP7:%.*]] = tail call <vscale x 32 x i16> @llvm.vector.insert.nxv32i16.nxv8i16(<vscale x 32 x i16> [[TMP5]], <vscale x 8 x i16> [[TMP6]], i64 16)
+// CHECK-NEXT: [[TMP8:%.*]] = extractvalue { <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16> } [[TMP1]], 3
+// CHECK-NEXT: [[TMP9:%.*]] = tail call <vscale x 32 x i16> @llvm.vector.insert.nxv32i16.nxv8i16(<vscale x 32 x i16> [[TMP7]], <vscale x 8 x i16> [[TMP8]], i64 24)
+// CHECK-NEXT: ret <vscale x 32 x i16> [[TMP9]]
+//
+// CPP-CHECK-LABEL: @_Z15test_svld4q_s16u10__SVBool_tPKs(
+// CPP-CHECK-NEXT: entry:
+// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 8 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv8i1(<vscale x 16 x i1> [[PG:%.*]])
+// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call { <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16> } @llvm.aarch64.sve.ld4q.sret.nxv8i16(<vscale x 8 x i1> [[TMP0]], ptr [[BASE:%.*]])
+// CPP-CHECK-NEXT: [[TMP2:%.*]] = extractvalue { <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16> } [[TMP1]], 0
+// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call <vscale x 32 x i16> @llvm.vector.insert.nxv32i16.nxv8i16(<vscale x 32 x i16> poison, <vscale x 8 x i16> [[TMP2]], i64 0)
+// CPP-CHECK-NEXT: [[TMP4:%.*]] = extractvalue { <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16> } [[TMP1]], 1
+// CPP-CHECK-NEXT: [[TMP5:%.*]] = tail call <vscale x 32 x i16> @llvm.vector.insert.nxv32i16.nxv8i16(<vscale x 32 x i16> [[TMP3]], <vscale x 8 x i16> [[TMP4]], i64 8)
+// CPP-CHECK-NEXT: [[TMP6:%.*]] = extractvalue { <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16> } [[TMP1]], 2
+// CPP-CHECK-NEXT: [[TMP7:%.*]] = tail call <vscale x 32 x i16> @llvm.vector.insert.nxv32i16.nxv8i16(<vscale x 32 x i16> [[TMP5]], <vscale x 8 x i16> [[TMP6]], i64 16)
+// CPP-CHECK-NEXT: [[TMP8:%.*]] = extractvalue { <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16> } [[TMP1]], 3
+// CPP-CHECK-NEXT: [[TMP9:%.*]] = tail call <vscale x 32 x i16> @llvm.vector.insert.nxv32i16.nxv8i16(<vscale x 32 x i16> [[TMP7]], <vscale x 8 x i16> [[TMP8]], i64 24)
+// CPP-CHECK-NEXT: ret <vscale x 32 x i16> [[TMP9]]
+//
+svint16x4_t test_svld4q_s16(svbool_t pg, const int16_t *base)
+{
+ return SVE_ACLE_FUNC(svld4q,,_s16,)(pg, base);
+}
+
+// CHECK-LABEL: @test_svld4q_u32(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 4 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv4i1(<vscale x 16 x i1> [[PG:%.*]])
+// CHECK-NEXT: [[TMP1:%.*]] = tail call { <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32> } @llvm.aarch64.sve.ld4q.sret.nxv4i32(<vscale x 4 x i1> [[TMP0]], ptr [[BASE:%.*]])
+// CHECK-NEXT: [[TMP2:%.*]] = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32> } [[TMP1]], 0
+// CHECK-NEXT: [[TMP3:%.*]] = tail call <vscale x 16 x i32> @llvm.vector.insert.nxv16i32.nxv4i32(<vscale x 16 x i32> poison, <vscale x 4 x i32> [[TMP2]], i64 0)
+// CHECK-NEXT: [[TMP4:%.*]] = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32> } [[TMP1]], 1
+// CHECK-NEXT: [[TMP5:%.*]] = tail call <vscale x 16 x i32> @llvm.vector.insert.nxv16i32.nxv4i32(<vscale x 16 x i32> [[TMP3]], <vscale x 4 x i32> [[TMP4]], i64 4)
+// CHECK-NEXT: [[TMP6:%.*]] = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32> } [[TMP1]], 2
+// CHECK-NEXT: [[TMP7:%.*]] = tail call <vscale x 16 x i32> @llvm.vector.insert.nxv16i32.nxv4i32(<vscale x 16 x i32> [[TMP5]], <vscale x 4 x i32> [[TMP6]], i64 8)
+// CHECK-NEXT: [[TMP8:%.*]] = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32> } [[TMP1]], 3
+// CHECK-NEXT: [[TMP9:%.*]] = tail call <vscale x 16 x i32> @llvm.vector.insert.nxv16i32.nxv4i32(<vscale x 16 x i32> [[TMP7]], <vscale x 4 x i32> [[TMP8]], i64 12)
+// CHECK-NEXT: ret <vscale x 16 x i32> [[TMP9]]
+//
+// CPP-CHECK-LABEL: @_Z15test_svld4q_u32u10__SVBool_tPKj(
+// CPP-CHECK-NEXT: entry:
+// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 4 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv4i1(<vscale x 16 x i1> [[PG:%.*]])
+// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call { <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32> } @llvm.aarch64.sve.ld4q.sret.nxv4i32(<vscale x 4 x i1> [[TMP0]], ptr [[BASE:%.*]])
+// CPP-CHECK-NEXT: [[TMP2:%.*]] = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32> } [[TMP1]], 0
+// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call <vscale x 16 x i32> @llvm.vector.insert.nxv16i32.nxv4i32(<vscale x 16 x i32> poison, <vscale x 4 x i32> [[TMP2]], i64 0)
+// CPP-CHECK-NEXT: [[TMP4:%.*]] = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32> } [[TMP1]], 1
+// CPP-CHECK-NEXT: [[TMP5:%.*]] = tail call <vscale x 16 x i32> @llvm.vector.insert.nxv16i32.nxv4i32(<vscale x 16 x i32> [[TMP3]], <vscale x 4 x i32> [[TMP4]], i64 4)
+// CPP-CHECK-NEXT: [[TMP6:%.*]] = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32> } [[TMP1]], 2
+// CPP-CHECK-NEXT: [[TMP7:%.*]] = tail call <vscale x 16 x i32> @llvm.vector.insert.nxv16i32.nxv4i32(<vscale x 16 x i32> [[TMP5]], <vscale x 4 x i32> [[TMP6]], i64 8)
+// CPP-CHECK-NEXT: [[TMP8:%.*]] = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32> } [[TMP1]], 3
+// CPP-CHECK-NEXT: [[TMP9:%.*]] = tail call <vscale x 16 x i32> @llvm.vector.insert.nxv16i32.nxv4i32(<vscale x 16 x i32> [[TMP7]], <vscale x 4 x i32> [[TMP8]], i64 12)
+// CPP-CHECK-NEXT: ret <vscale x 16 x i32> [[TMP9]]
+//
+svuint32x4_t test_svld4q_u32(svbool_t pg, const uint32_t *base)
+{
+ return SVE_ACLE_FUNC(svld4q,,_u32,)(pg, base);
+}
+
+// CHECK-LABEL: @test_svld4q_s32(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 4 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv4i1(<vscale x 16 x i1> [[PG:%.*]])
+// CHECK-NEXT: [[TMP1:%.*]] = tail call { <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32> } @llvm.aarch64.sve.ld4q.sret.nxv4i32(<vscale x 4 x i1> [[TMP0]], ptr [[BASE:%.*]])
+// CHECK-NEXT: [[TMP2:%.*]] = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32> } [[TMP1]], 0
+// CHECK-NEXT: [[TMP3:%.*]] = tail call <vscale x 16 x i32> @llvm.vector.insert.nxv16i32.nxv4i32(<vscale x 16 x i32> poison, <vscale x 4 x i32> [[TMP2]], i64 0)
+// CHECK-NEXT: [[TMP4:%.*]] = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32> } [[TMP1]], 1
+// CHECK-NEXT: [[TMP5:%.*]] = tail call <vscale x 16 x i32> @llvm.vector.insert.nxv16i32.nxv4i32(<vscale x 16 x i32> [[TMP3]], <vscale x 4 x i32> [[TMP4]], i64 4)
+// CHECK-NEXT: [[TMP6:%.*]] = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32> } [[TMP1]], 2
+// CHECK-NEXT: [[TMP7:%.*]] = tail call <vscale x 16 x i32> @llvm.vector.insert.nxv16i32.nxv4i32(<vscale x 16 x i32> [[TMP5]], <vscale x 4 x i32> [[TMP6]], i64 8)
+// CHECK-NEXT: [[TMP8:%.*]] = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32> } [[TMP1]], 3
+// CHECK-NEXT: [[TMP9:%.*]] = tail call <vscale x 16 x i32> @llvm.vector.insert.nxv16i32.nxv4i32(<vscale x 16 x i32> [[TMP7]], <vscale x 4 x i32> [[TMP8]], i64 12)
+// CHECK-NEXT: ret <vscale x 16 x i32> [[TMP9]]
+//
+// CPP-CHECK-LABEL: @_Z15test_svld4q_s32u10__SVBool_tPKi(
+// CPP-CHECK-NEXT: entry:
+// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 4 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv4i1(<vscale x 16 x i1> [[PG:%.*]])
+// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call { <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32> } @llvm.aarch64.sve.ld4q.sret.nxv4i32(<vscale x 4 x i1> [[TMP0]], ptr [[BASE:%.*]])
+// CPP-CHECK-NEXT: [[TMP2:%.*]] = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32> } [[TMP1]], 0
+// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call <vscale x 16 x i32> @llvm.vector.insert.nxv16i32.nxv4i32(<vscale x 16 x i32> poison, <vscale x 4 x i32> [[TMP2]], i64 0)
+// CPP-CHECK-NEXT: [[TMP4:%.*]] = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32> } [[TMP1]], 1
+// CPP-CHECK-NEXT: [[TMP5:%.*]] = tail call <vscale x 16 x i32> @llvm.vector.insert.nxv16i32.nxv4i32(<vscale x 16 x i32> [[TMP3]], <vscale x 4 x i32> [[TMP4]], i64 4)
+// CPP-CHECK-NEXT: [[TMP6:%.*]] = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32> } [[TMP1]], 2
+// CPP-CHECK-NEXT: [[TMP7:%.*]] = tail call <vscale x 16 x i32> @llvm.vector.insert.nxv16i32.nxv4i32(<vscale x 16 x i32> [[TMP5]], <vscale x 4 x i32> [[TMP6]], i64 8)
+// CPP-CHECK-NEXT: [[TMP8:%.*]] = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32> } [[TMP1]], 3
+// CPP-CHECK-NEXT: [[TMP9:%.*]] = tail call <vscale x 16 x i32> @llvm.vector.insert.nxv16i32.nxv4i32(<vscale x 16 x i32> [[TMP7]], <vscale x 4 x i32> [[TMP8]], i64 12)
+// CPP-CHECK-NEXT: ret <vscale x 16 x i32> [[TMP9]]
+//
+svint32x4_t test_svld4q_s32(svbool_t pg, const int32_t *base)
+{
+ return SVE_ACLE_FUNC(svld4q,,_s32,)(pg, base);
+}
+
+// CHECK-LABEL: @test_svld4q_u64(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 2 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv2i1(<vscale x 16 x i1> [[PG:%.*]])
+// CHECK-NEXT: [[TMP1:%.*]] = tail call { <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64> } @llvm.aarch64.sve.ld4q.sret.nxv2i64(<vscale x 2 x i1> [[TMP0]], ptr [[BASE:%.*]])
+// CHECK-NEXT: [[TMP2:%.*]] = extractvalue { <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64> } [[TMP1]], 0
+// CHECK-NEXT: [[TMP3:%.*]] = tail call <vscale x 8 x i64> @llvm.vector.insert.nxv8i64.nxv2i64(<vscale x 8 x i64> poison, <vscale x 2 x i64> [[TMP2]], i64 0)
+// CHECK-NEXT: [[TMP4:%.*]] = extractvalue { <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64> } [[TMP1]], 1
+// CHECK-NEXT: [[TMP5:%.*]] = tail call <vscale x 8 x i64> @llvm.vector.insert.nxv8i64.nxv2i64(<vscale x 8 x i64> [[TMP3]], <vscale x 2 x i64> [[TMP4]], i64 2)
+// CHECK-NEXT: [[TMP6:%.*]] = extractvalue { <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64> } [[TMP1]], 2
+// CHECK-NEXT: [[TMP7:%.*]] = tail call <vscale x 8 x i64> @llvm.vector.insert.nxv8i64.nxv2i64(<vscale x 8 x i64> [[TMP5]], <vscale x 2 x i64> [[TMP6]], i64 4)
+// CHECK-NEXT: [[TMP8:%.*]] = extractvalue { <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64> } [[TMP1]], 3
+// CHECK-NEXT: [[TMP9:%.*]] = tail call <vscale x 8 x i64> @llvm.vector.insert.nxv8i64.nxv2i64(<vscale x 8 x i64> [[TMP7]], <vscale x 2 x i64> [[TMP8]], i64 6)
+// CHECK-NEXT: ret <vscale x 8 x i64> [[TMP9]]
+//
+// CPP-CHECK-LABEL: @_Z15test_svld4q_u64u10__SVBool_tPKm(
+// CPP-CHECK-NEXT: entry:
+// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 2 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv2i1(<vscale x 16 x i1> [[PG:%.*]])
+// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call { <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64> } @llvm.aarch64.sve.ld4q.sret.nxv2i64(<vscale x 2 x i1> [[TMP0]], ptr [[BASE:%.*]])
+// CPP-CHECK-NEXT: [[TMP2:%.*]] = extractvalue { <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64> } [[TMP1]], 0
+// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call <vscale x 8 x i64> @llvm.vector.insert.nxv8i64.nxv2i64(<vscale x 8 x i64> poison, <vscale x 2 x i64> [[TMP2]], i64 0)
+// CPP-CHECK-NEXT: [[TMP4:%.*]] = extractvalue { <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64> } [[TMP1]], 1
+// CPP-CHECK-NEXT: [[TMP5:%.*]] = tail call <vscale x 8 x i64> @llvm.vector.insert.nxv8i64.nxv2i64(<vscale x 8 x i64> [[TMP3]], <vscale x 2 x i64> [[TMP4]], i64 2)
+// CPP-CHECK-NEXT: [[TMP6:%.*]] = extractvalue { <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64> } [[TMP1]], 2
+// CPP-CHECK-NEXT: [[TMP7:%.*]] = tail call <vscale x 8 x i64> @llvm.vector.insert.nxv8i64.nxv2i64(<vscale x 8 x i64> [[TMP5]], <vscale x 2 x i64> [[TMP6]], i64 4)
+// CPP-CHECK-NEXT: [[TMP8:%.*]] = extractvalue { <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64> } [[TMP1]], 3
+// CPP-CHECK-NEXT: [[TMP9:%.*]] = tail call <vscale x 8 x i64> @llvm.vector.insert.nxv8i64.nxv2i64(<vscale x 8 x i64> [[TMP7]], <vscale x 2 x i64> [[TMP8]], i64 6)
+// CPP-CHECK-NEXT: ret <vscale x 8 x i64> [[TMP9]]
+//
+svuint64x4_t test_svld4q_u64(svbool_t pg, const uint64_t *base)
+{
+ return SVE_ACLE_FUNC(svld4q,,_u64,)(pg, base);
+}
+
+// CHECK-LABEL: @test_svld4q_s64(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 2 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv2i1(<vscale x 16 x i1> [[PG:%.*]])
+// CHECK-NEXT: [[TMP1:%.*]] = tail call { <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64> } @llvm.aarch64.sve.ld4q.sret.nxv2i64(<vscale x 2 x i1> [[TMP0]], ptr [[BASE:%.*]])
+// CHECK-NEXT: [[TMP2:%.*]] = extractvalue { <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64> } [[TMP1]], 0
+// CHECK-NEXT: [[TMP3:%.*]] = tail call <vscale x 8 x i64> @llvm.vector.insert.nxv8i64.nxv2i64(<vscale x 8 x i64> poison, <vscale x 2 x i64> [[TMP2]], i64 0)
+// CHECK-NEXT: [[TMP4:%.*]] = extractvalue { <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64> } [[TMP1]], 1
+// CHECK-NEXT: [[TMP5:%.*]] = tail call <vscale x 8 x i64> @llvm.vector.insert.nxv8i64.nxv2i64(<vscale x 8 x i64> [[TMP3]], <vscale x 2 x i64> [[TMP4]], i64 2)
+// CHECK-NEXT: [[TMP6:%.*]] = extractvalue { <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64> } [[TMP1]], 2
+// CHECK-NEXT: [[TMP7:%.*]] = tail call <vscale x 8 x i64> @llvm.vector.insert.nxv8i64.nxv2i64(<vscale x 8 x i64> [[TMP5]], <vscale x 2 x i64> [[TMP6]], i64 4)
+// CHECK-NEXT: [[TMP8:%.*]] = extractvalue { <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64> } [[TMP1]], 3
+// CHECK-NEXT: [[TMP9:%.*]] = tail call <vscale x 8 x i64> @llvm.vector.insert.nxv8i64.nxv2i64(<vscale x 8 x i64> [[TMP7]], <vscale x 2 x i64> [[TMP8]], i64 6)
+// CHECK-NEXT: ret <vscale x 8 x i64> [[TMP9]]
+//
+// CPP-CHECK-LABEL: @_Z15test_svld4q_s64u10__SVBool_tPKl(
+// CPP-CHECK-NEXT: entry:
+// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 2 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv2i1(<vscale x 16 x i1> [[PG:%.*]])
+// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call { <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64> } @llvm.aarch64.sve.ld4q.sret.nxv2i64(<vscale x 2 x i1> [[TMP0]], ptr [[BASE:%.*]])
+// CPP-CHECK-NEXT: [[TMP2:%.*]] = extractvalue { <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64> } [[TMP1]], 0
+// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call <vscale x 8 x i64> @llvm.vector.insert.nxv8i64.nxv2i64(<vscale x 8 x i64> poison, <vscale x 2 x i64> [[TMP2]], i64 0)
+// CPP-CHECK-NEXT: [[TMP4:%.*]] = extractvalue { <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64> } [[TMP1]], 1
+// CPP-CHECK-NEXT: [[TMP5:%.*]] = tail call <vscale x 8 x i64> @llvm.vector.insert.nxv8i64.nxv2i64(<vscale x 8 x i64> [[TMP3]], <vscale x 2 x i64> [[TMP4]], i64 2)
+// CPP-CHECK-NEXT: [[TMP6:%.*]] = extractvalue { <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64> } [[TMP1]], 2
+// CPP-CHECK-NEXT: [[TMP7:%.*]] = tail call <vscale x 8 x i64> @llvm.vector.insert.nxv8i64.nxv2i64(<vscale x 8 x i64> [[TMP5]], <vscale x 2 x i64> [[TMP6]], i64 4)
+// CPP-CHECK-NEXT: [[TMP8:%.*]] = extractvalue { <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64> } [[TMP1]], 3
+// CPP-CHECK-NEXT: [[TMP9:%.*]] = tail call <vscale x 8 x i64> @llvm.vector.insert.nxv8i64.nxv2i64(<vscale x 8 x i64> [[TMP7]], <vscale x 2 x i64> [[TMP8]], i64 6)
+// CPP-CHECK-NEXT: ret <vscale x 8 x i64> [[TMP9]]
+//
+svint64x4_t test_svld4q_s64(svbool_t pg, const int64_t *base)
+{
+ return SVE_ACLE_FUNC(svld4q,,_s64,)(pg, base);
+}
+
+// CHECK-LABEL: @test_svld4q_f16(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 8 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv8i1(<vscale x 16 x i1> [[PG:%.*]])
+// CHECK-NEXT: [[TMP1:%.*]] = tail call { <vscale x 8 x half>, <vscale x 8 x half>, <vscale x 8 x half>, <vscale x 8 x half> } @llvm.aarch64.sve.ld4q.sret.nxv8f16(<vscale x 8 x i1> [[TMP0]], ptr [[BASE:%.*]])
+// CHECK-NEXT: [[TMP2:%.*]] = extractvalue { <vscale x 8 x half>, <vscale x 8 x half>, <vscale x 8 x half>, <vscale x 8 x half> } [[TMP1]], 0
+// CHECK-NEXT: [[TMP3:%.*]] = tail call <vscale x 32 x half> @llvm.vector.insert.nxv32f16.nxv8f16(<vscale x 32 x half> poison, <vscale x 8 x half> [[TMP2]], i64 0)
+// CHECK-NEXT: [[TMP4:%.*]] = extractvalue { <vscale x 8 x half>, <vscale x 8 x half>, <vscale x 8 x half>, <vscale x 8 x half> } [[TMP1]], 1
+// CHECK-NEXT: [[TMP5:%.*]] = tail call <vscale x 32 x half> @llvm.vector.insert.nxv32f16.nxv8f16(<vscale x 32 x half> [[TMP3]], <vscale x 8 x half> [[TMP4]], i64 8)
+// CHECK-NEXT: [[TMP6:%.*]] = extractvalue { <vscale x 8 x half>, <vscale x 8 x half>, <vscale x 8 x half>, <vscale x 8 x half> } [[TMP1]], 2
+// CHECK-NEXT: [[TMP7:%.*]] = tail call <vscale x 32 x half> @llvm.vector.insert.nxv32f16.nxv8f16(<vscale x 32 x half> [[TMP5]], <vscale x 8 x half> [[TMP6]], i64 16)
+// CHECK-NEXT: [[TMP8:%.*]] = extractvalue { <vscale x 8 x half>, <vscale x 8 x half>, <vscale x 8 x half>, <vscale x 8 x half> } [[TMP1]], 3
+// CHECK-NEXT: [[TMP9:%.*]] = tail call <vscale x 32 x half> @llvm.vector.insert.nxv32f16.nxv8f16(<vscale x 32 x half> [[TMP7]], <vscale x 8 x half> [[TMP8]], i64 24)
+// CHECK-NEXT: ret <vscale x 32 x half> [[TMP9]]
+//
+// CPP-CHECK-LABEL: @_Z15test_svld4q_f16u10__SVBool_tPKDh(
+// CPP-CHECK-NEXT: entry:
+// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 8 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv8i1(<vscale x 16 x i1> [[PG:%.*]])
+// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call { <vscale x 8 x half>, <vscale x 8 x half>, <vscale x 8 x half>, <vscale x 8 x half> } @llvm.aarch64.sve.ld4q.sret.nxv8f16(<vscale x 8 x i1> [[TMP0]], ptr [[BASE:%.*]])
+// CPP-CHECK-NEXT: [[TMP2:%.*]] = extractvalue { <vscale x 8 x half>, <vscale x 8 x half>, <vscale x 8 x half>, <vscale x 8 x half> } [[TMP1]], 0
+// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call <vscale x 32 x half> @llvm.vector.insert.nxv32f16.nxv8f16(<vscale x 32 x half> poison, <vscale x 8 x half> [[TMP2]], i64 0)
+// CPP-CHECK-NEXT: [[TMP4:%.*]] = extractvalue { <vscale x 8 x half>, <vscale x 8 x half>, <vscale x 8 x half>, <vscale x 8 x half> } [[TMP1]], 1
+// CPP-CHECK-NEXT: [[TMP5:%.*]] = tail call <vscale x 32 x half> @llvm.vector.insert.nxv32f16.nxv8f16(<vscale x 32 x half> [[TMP3]], <vscale x 8 x half> [[TMP4]], i64 8)
+// CPP-CHECK-NEXT: [[TMP6:%.*]] = extractvalue { <vscale x 8 x half>, <vscale x 8 x half>, <vscale x 8 x half>, <vscale x 8 x half> } [[TMP1]], 2
+// CPP-CHECK-NEXT: [[TMP7:%.*]] = tail call <vscale x 32 x half> @llvm.vector.insert.nxv32f16.nxv8f16(<vscale x 32 x half> [[TMP5]], <vscale x 8 x half> [[TMP6]], i64 16)
+// CPP-CHECK-NEXT: [[TMP8:%.*]] = extractvalue { <vscale x 8 x half>, <vscale x 8 x half>, <vscale x 8 x half>, <vscale x 8 x half> } [[TMP1]], 3
+// CPP-CHECK-NEXT: [[TMP9:%.*]] = tail call <vscale x 32 x half> @llvm.vector.insert.nxv32f16.nxv8f16(<vscale x 32 x half> [[TMP7]], <vscale x 8 x half> [[TMP8]], i64 24)
+// CPP-CHECK-NEXT: ret <vscale x 32 x half> [[TMP9]]
+//
+svfloat16x4_t test_svld4q_f16(svbool_t pg, const float16_t *base)
+{
+ return SVE_ACLE_FUNC(svld4q,,_f16,)(pg, base);
+}
+
+// CHECK-LABEL: @test_svld4q_bf16(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 8 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv8i1(<vscale x 16 x i1> [[PG:%.*]])
+// CHECK-NEXT: [[TMP1:%.*]] = tail call { <vscale x 8 x bfloat>, <vscale x 8 x bfloat>, <vscale x 8 x bfloat>, <vscale x 8 x bfloat> } @llvm.aarch64.sve.ld4q.sret.nxv8bf16(<vscale x 8 x i1> [[TMP0]], ptr [[BASE:%.*]])
+// CHECK-NEXT: [[TMP2:%.*]] = extractvalue { <vscale x 8 x bfloat>, <vscale x 8 x bfloat>, <vscale x 8 x bfloat>, <vscale x 8 x bfloat> } [[TMP1]], 0
+// CHECK-NEXT: [[TMP3:%.*]] = tail call <vscale x 32 x bfloat> @llvm.vector.insert.nxv32bf16.nxv8bf16(<vscale x 32 x bfloat> poison, <vscale x 8 x bfloat> [[TMP2]], i64 0)
+// CHECK-NEXT: [[TMP4:%.*]] = extractvalue { <vscale x 8 x bfloat>, <vscale x 8 x bfloat>, <vscale x 8 x bfloat>, <vscale x 8 x bfloat> } [[TMP1]], 1
+// CHECK-NEXT: [[TMP5:%.*]] = tail call <vscale x 32 x bfloat> @llvm.vector.insert.nxv32bf16.nxv8bf16(<vscale x 32 x bfloat> [[TMP3]], <vscale x 8 x bfloat> [[TMP4]], i64 8)
+// CHECK-NEXT: [[TMP6:%.*]] = extractvalue { <vscale x 8 x bfloat>, <vscale x 8 x bfloat>, <vscale x 8 x bfloat>, <vscale x 8 x bfloat> } [[TMP1]], 2
+// CHECK-NEXT: [[TMP7:%.*]] = tail call <vscale x 32 x bfloat> @llvm.vector.insert.nxv32bf16.nxv8bf16(<vscale x 32 x bfloat> [[TMP5]], <vscale x 8 x bfloat> [[TMP6]], i64 16)
+// CHECK-NEXT: [[TMP8:%.*]] = extractvalue { <vscale x 8 x bfloat>, <vscale x 8 x bfloat>, <vscale x 8 x bfloat>, <vscale x 8 x bfloat> } [[TMP1]], 3
+// CHECK-NEXT: [[TMP9:%.*]] = tail call <vscale x 32 x bfloat> @llvm.vector.insert.nxv32bf16.nxv8bf16(<vscale x 32 x bfloat> [[TMP7]], <vscale x 8 x bfloat> [[TMP8]], i64 24)
+// CHECK-NEXT: ret <vscale x 32 x bfloat> [[TMP9]]
+//
+// CPP-CHECK-LABEL: @_Z16test_svld4q_bf16u10__SVBool_tPKu6__bf16(
+// CPP-CHECK-NEXT: entry:
+// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 8 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv8i1(<vscale x 16 x i1> [[PG:%.*]])
+// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call { <vscale x 8 x bfloat>, <vscale x 8 x bfloat>, <vscale x 8 x bfloat>, <vscale x 8 x bfloat> } @llvm.aarch64.sve.ld4q.sret.nxv8bf16(<vscale x 8 x i1> [[TMP0]], ptr [[BASE:%.*]])
+// CPP-CHECK-NEXT: [[TMP2:%.*]] = extractvalue { <vscale x 8 x bfloat>, <vscale x 8 x bfloat>, <vscale x 8 x bfloat>, <vscale x 8 x bfloat> } [[TMP1]], 0
+// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call <vscale x 32 x bfloat> @llvm.vector.insert.nxv32bf16.nxv8bf16(<vscale x 32 x bfloat> poison, <vscale x 8 x bfloat> [[TMP2]], i64 0)
+// CPP-CHECK-NEXT: [[TMP4:%.*]] = extractvalue { <vscale x 8 x bfloat>, <vscale x 8 x bfloat>, <vscale x 8 x bfloat>, <vscale x 8 x bfloat> } [[TMP1]], 1
+// CPP-CHECK-NEXT: [[TMP5:%.*]] = tail call <vscale x 32 x bfloat> @llvm.vector.insert.nxv32bf16.nxv8bf16(<vscale x 32 x bfloat> [[TMP3]], <vscale x 8 x bfloat> [[TMP4]], i64 8)
+// CPP-CHECK-NEXT: [[TMP6:%.*]] = extractvalue { <vscale x 8 x bfloat>, <vscale x 8 x bfloat>, <vscale x 8 x bfloat>, <vscale x 8 x bfloat> } [[TMP1]], 2
+// CPP-CHECK-NEXT: [[TMP7:%.*]] = tail call <vscale x 32 x bfloat> @llvm.vector.insert.nxv32bf16.nxv8bf16(<vscale x 32 x bfloat> [[TMP5]], <vscale x 8 x bfloat> [[TMP6]], i64 16)
+// CPP-CHECK-NEXT: [[TMP8:%.*]] = extractvalue { <vscale x 8 x bfloat>, <vscale x 8 x bfloat>, <vscale x 8 x bfloat>, <vscale x 8 x bfloat> } [[TMP1]], 3
+// CPP-CHECK-NEXT: [[TMP9:%.*]] = tail call <vscale x 32 x bfloat> @llvm.vector.insert.nxv32bf16.nxv8bf16(<vscale x 32 x bfloat> [[TMP7]], <vscale x 8 x bfloat> [[TMP8]], i64 24)
+// CPP-CHECK-NEXT: ret <vscale x 32 x bfloat> [[TMP9]]
+//
+svbfloat16x4_t test_svld4q_bf16(svbool_t pg, const bfloat16_t *base)
+{
+ return SVE_ACLE_FUNC(svld4q,,_bf16,)(pg, base);
+}
+
+// CHECK-LABEL: @test_svld4q_f32(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 4 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv4i1(<vscale x 16 x i1> [[PG:%.*]])
+// CHECK-NEXT: [[TMP1:%.*]] = tail call { <vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float> } @llvm.aarch64.sve.ld4q.sret.nxv4f32(<vscale x 4 x i1> [[TMP0]], ptr [[BASE:%.*]])
+// CHECK-NEXT: [[TMP2:%.*]] = extractvalue { <vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float> } [[TMP1]], 0
+// CHECK-NEXT: [[TMP3:%.*]] = tail call <vscale x 16 x float> @llvm.vector.insert.nxv16f32.nxv4f32(<vscale x 16 x float> poison, <vscale x 4 x float> [[TMP2]], i64 0)
+// CHECK-NEXT: [[TMP4:%.*]] = extractvalue { <vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float> } [[TMP1]], 1
+// CHECK-NEXT: [[TMP5:%.*]] = tail call <vscale x 16 x float> @llvm.vector.insert.nxv16f32.nxv4f32(<vscale x 16 x float> [[TMP3]], <vscale x 4 x float> [[TMP4]], i64 4)
+// CHECK-NEXT: [[TMP6:%.*]] = extractvalue { <vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float> } [[TMP1]], 2
+// CHECK-NEXT: [[TMP7:%.*]] = tail call <vscale x 16 x float> @llvm.vector.insert.nxv16f32.nxv4f32(<vscale x 16 x float> [[TMP5]], <vscale x 4 x float> [[TMP6]], i64 8)
+// CHECK-NEXT: [[TMP8:%.*]] = extractvalue { <vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float> } [[TMP1]], 3
+// CHECK-NEXT: [[TMP9:%.*]] = tail call <vscale x 16 x float> @llvm.vector.insert.nxv16f32.nxv4f32(<vscale x 16 x float> [[TMP7]], <vscale x 4 x float> [[TMP8]], i64 12)
+// CHECK-NEXT: ret <vscale x 16 x float> [[TMP9]]
+//
+// CPP-CHECK-LABEL: @_Z15test_svld4q_f32u10__SVBool_tPKf(
+// CPP-CHECK-NEXT: entry:
+// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 4 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv4i1(<vscale x 16 x i1> [[PG:%.*]])
+// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call { <vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float> } @llvm.aarch64.sve.ld4q.sret.nxv4f32(<vscale x 4 x i1> [[TMP0]], ptr [[BASE:%.*]])
+// CPP-CHECK-NEXT: [[TMP2:%.*]] = extractvalue { <vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float> } [[TMP1]], 0
+// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call <vscale x 16 x float> @llvm.vector.insert.nxv16f32.nxv4f32(<vscale x 16 x float> poison, <vscale x 4 x float> [[TMP2]], i64 0)
+// CPP-CHECK-NEXT: [[TMP4:%.*]] = extractvalue { <vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float> } [[TMP1]], 1
+// CPP-CHECK-NEXT: [[TMP5:%.*]] = tail call <vscale x 16 x float> @llvm.vector.insert.nxv16f32.nxv4f32(<vscale x 16 x float> [[TMP3]], <vscale x 4 x float> [[TMP4]], i64 4)
+// CPP-CHECK-NEXT: [[TMP6:%.*]] = extractvalue { <vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float> } [[TMP1]], 2
+// CPP-CHECK-NEXT: [[TMP7:%.*]] = tail call <vscale x 16 x float> @llvm.vector.insert.nxv16f32.nxv4f32(<vscale x 16 x float> [[TMP5]], <vscale x 4 x float> [[TMP6]], i64 8)
+// CPP-CHECK-NEXT: [[TMP8:%.*]] = extractvalue { <vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float> } [[TMP1]], 3
+// CPP-CHECK-NEXT: [[TMP9:%.*]] = tail call <vscale x 16 x float> @llvm.vector.insert.nxv16f32.nxv4f32(<vscale x 16 x float> [[TMP7]], <vscale x 4 x float> [[TMP8]], i64 12)
+// CPP-CHECK-NEXT: ret <vscale x 16 x float> [[TMP9]]
+//
+svfloat32x4_t test_svld4q_f32(svbool_t pg, const float32_t *base)
+{
+ return SVE_ACLE_FUNC(svld4q,,_f32,)(pg, base);
+}
+
+// CHECK-LABEL: @test_svld4q_f64(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 2 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv2i1(<vscale x 16 x i1> [[PG:%.*]])
+// CHECK-NEXT: [[TMP1:%.*]] = tail call { <vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x double> } @llvm.aarch64.sve.ld4q.sret.nxv2f64(<vscale x 2 x i1> [[TMP0]], ptr [[BASE:%.*]])
+// CHECK-NEXT: [[TMP2:%.*]] = extractvalue { <vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x double> } [[TMP1]], 0
+// CHECK-NEXT: [[TMP3:%.*]] = tail call <vscale x 8 x double> @llvm.vector.insert.nxv8f64.nxv2f64(<vscale x 8 x double> poison, <vscale x 2 x double> [[TMP2]], i64 0)
+// CHECK-NEXT: [[TMP4:%.*]] = extractvalue { <vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x double> } [[TMP1]], 1
+// CHECK-NEXT: [[TMP5:%.*]] = tail call <vscale x 8 x double> @llvm.vector.insert.nxv8f64.nxv2f64(<vscale x 8 x double> [[TMP3]], <vscale x 2 x double> [[TMP4]], i64 2)
+// CHECK-NEXT: [[TMP6:%.*]] = extractvalue { <vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x double> } [[TMP1]], 2
+// CHECK-NEXT: [[TMP7:%.*]] = tail call <vscale x 8 x double> @llvm.vector.insert.nxv8f64.nxv2f64(<vscale x 8 x double> [[TMP5]], <vscale x 2 x double> [[TMP6]], i64 4)
+// CHECK-NEXT: [[TMP8:%.*]] = extractvalue { <vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x double> } [[TMP1]], 3
+// CHECK-NEXT: [[TMP9:%.*]] = tail call <vscale x 8 x double> @llvm.vector.insert.nxv8f64.nxv2f64(<vscale x 8 x double> [[TMP7]], <vscale x 2 x double> [[TMP8]], i64 6)
+// CHECK-NEXT: ret <vscale x 8 x double> [[TMP9]]
+//
+// CPP-CHECK-LABEL: @_Z15test_svld4q_f64u10__SVBool_tPKd(
+// CPP-CHECK-NEXT: entry:
+// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 2 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv2i1(<vscale x 16 x i1> [[PG:%.*]])
+// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call { <vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x double> } @llvm.aarch64.sve.ld4q.sret.nxv2f64(<vscale x 2 x i1> [[TMP0]], ptr [[BASE:%.*]])
+// CPP-CHECK-NEXT: [[TMP2:%.*]] = extractvalue { <vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x double> } [[TMP1]], 0
+// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call <vscale x 8 x double> @llvm.vector.insert.nxv8f64.nxv2f64(<vscale x 8 x double> poison, <vscale x 2 x double> [[TMP2]], i64 0)
+// CPP-CHECK-NEXT: [[TMP4:%.*]] = extractvalue { <vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x double> } [[TMP1]], 1
+// CPP-CHECK-NEXT: [[TMP5:%.*]] = tail call <vscale x 8 x double> @llvm.vector.insert.nxv8f64.nxv2f64(<vscale x 8 x double> [[TMP3]], <vscale x 2 x double> [[TMP4]], i64 2)
+// CPP-CHECK-NEXT: [[TMP6:%.*]] = extractvalue { <vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x double> } [[TMP1]], 2
+// CPP-CHECK-NEXT: [[TMP7:%.*]] = tail call <vscale x 8 x double> @llvm.vector.insert.nxv8f64.nxv2f64(<vscale x 8 x double> [[TMP5]], <vscale x 2 x double> [[TMP6]], i64 4)
+// CPP-CHECK-NEXT: [[TMP8:%.*]] = extractvalue { <vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x double> } [[TMP1]], 3
+// CPP-CHECK-NEXT: [[TMP9:%.*]] = tail call <vscale x 8 x double> @llvm.vector.insert.nxv8f64.nxv2f64(<vscale x 8 x double> [[TMP7]], <vscale x 2 x double> [[TMP8]], i64 6)
+// CPP-CHECK-NEXT: ret <vscale x 8 x double> [[TMP9]]
+//
+svfloat64x4_t test_svld4q_f64(svbool_t pg, const float64_t *base)
+{
+ return SVE_ACLE_FUNC(svld4q,,_f64,)(pg, base);
+}
+
+// CHECK-LABEL: @test_svld4q_vnum_u8(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[TMP0:%.*]] = getelementptr <vscale x 16 x i8>, ptr [[BASE:%.*]], i64 [[VNUM:%.*]]
+// CHECK-NEXT: [[TMP1:%.*]] = tail call { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } @llvm.aarch64.sve.ld4q.sret.nxv16i8(<vscale x 16 x i1> [[PG:%.*]], ptr [[TMP0]])
+// CHECK-NEXT: [[TMP2:%.*]] = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } [[TMP1]], 0
+// CHECK-NEXT: [[TMP3:%.*]] = tail call <vscale x 64 x i8> @llvm.vector.insert.nxv64i8.nxv16i8(<vscale x 64 x i8> poison, <vscale x 16 x i8> [[TMP2]], i64 0)
+// CHECK-NEXT: [[TMP4:%.*]] = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } [[TMP1]], 1
+// CHECK-NEXT: [[TMP5:%.*]] = tail call <vscale x 64 x i8> @llvm.vector.insert.nxv64i8.nxv16i8(<vscale x 64 x i8> [[TMP3]], <vscale x 16 x i8> [[TMP4]], i64 16)
+// CHECK-NEXT: [[TMP6:%.*]] = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } [[TMP1]], 2
+// CHECK-NEXT: [[TMP7:%.*]] = tail call <vscale x 64 x i8> @llvm.vector.insert.nxv64i8.nxv16i8(<vscale x 64 x i8> [[TMP5]], <vscale x 16 x i8> [[TMP6]], i64 32)
+// CHECK-NEXT: [[TMP8:%.*]] = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } [[TMP1]], 3
+// CHECK-NEXT: [[TMP9:%.*]] = tail call <vscale x 64 x i8> @llvm.vector.insert.nxv64i8.nxv16i8(<vscale x 64 x i8> [[TMP7]], <vscale x 16 x i8> [[TMP8]], i64 48)
+// CHECK-NEXT: ret <vscale x 64 x i8> [[TMP9]]
+//
+// CPP-CHECK-LABEL: @_Z19test_svld4q_vnum_u8u10__SVBool_tPKhl(
+// CPP-CHECK-NEXT: entry:
+// CPP-CHECK-NEXT: [[TMP0:%.*]] = getelementptr <vscale x 16 x i8>, ptr [[BASE:%.*]], i64 [[VNUM:%.*]]
+// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } @llvm.aarch64.sve.ld4q.sret.nxv16i8(<vscale x 16 x i1> [[PG:%.*]], ptr [[TMP0]])
+// CPP-CHECK-NEXT: [[TMP2:%.*]] = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } [[TMP1]], 0
+// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call <vscale x 64 x i8> @llvm.vector.insert.nxv64i8.nxv16i8(<vscale x 64 x i8> poison, <vscale x 16 x i8> [[TMP2]], i64 0)
+// CPP-CHECK-NEXT: [[TMP4:%.*]] = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } [[TMP1]], 1
+// CPP-CHECK-NEXT: [[TMP5:%.*]] = tail call <vscale x 64 x i8> @llvm.vector.insert.nxv64i8.nxv16i8(<vscale x 64 x i8> [[TMP3]], <vscale x 16 x i8> [[TMP4]], i64 16)
+// CPP-CHECK-NEXT: [[TMP6:%.*]] = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } [[TMP1]], 2
+// CPP-CHECK-NEXT: [[TMP7:%.*]] = tail call <vscale x 64 x i8> @llvm.vector.insert.nxv64i8.nxv16i8(<vscale x 64 x i8> [[TMP5]], <vscale x 16 x i8> [[TMP6]], i64 32)
+// CPP-CHECK-NEXT: [[TMP8:%.*]] = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } [[TMP1]], 3
+// CPP-CHECK-NEXT: [[TMP9:%.*]] = tail call <vscale x 64 x i8> @llvm.vector.insert.nxv64i8.nxv16i8(<vscale x 64 x i8> [[TMP7]], <vscale x 16 x i8> [[TMP8]], i64 48)
+// CPP-CHECK-NEXT: ret <vscale x 64 x i8> [[TMP9]]
+//
+svuint8x4_t test_svld4q_vnum_u8(svbool_t pg, const uint8_t *base, int64_t vnum)
+{
+ return SVE_ACLE_FUNC(svld4q_vnum_,,u8,)(pg, base, vnum);
+}
+
+// CHECK-LABEL: @test_svld4q_vnum_s8(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[TMP0:%.*]] = getelementptr <vscale x 16 x i8>, ptr [[BASE:%.*]], i64 [[VNUM:%.*]]
+// CHECK-NEXT: [[TMP1:%.*]] = tail call { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } @llvm.aarch64.sve.ld4q.sret.nxv16i8(<vscale x 16 x i1> [[PG:%.*]], ptr [[TMP0]])
+// CHECK-NEXT: [[TMP2:%.*]] = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } [[TMP1]], 0
+// CHECK-NEXT: [[TMP3:%.*]] = tail call <vscale x 64 x i8> @llvm.vector.insert.nxv64i8.nxv16i8(<vscale x 64 x i8> poison, <vscale x 16 x i8> [[TMP2]], i64 0)
+// CHECK-NEXT: [[TMP4:%.*]] = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } [[TMP1]], 1
+// CHECK-NEXT: [[TMP5:%.*]] = tail call <vscale x 64 x i8> @llvm.vector.insert.nxv64i8.nxv16i8(<vscale x 64 x i8> [[TMP3]], <vscale x 16 x i8> [[TMP4]], i64 16)
+// CHECK-NEXT: [[TMP6:%.*]] = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } [[TMP1]], 2
+// CHECK-NEXT: [[TMP7:%.*]] = tail call <vscale x 64 x i8> @llvm.vector.insert.nxv64i8.nxv16i8(<vscale x 64 x i8> [[TMP5]], <vscale x 16 x i8> [[TMP6]], i64 32)
+// CHECK-NEXT: [[TMP8:%.*]] = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } [[TMP1]], 3
+// CHECK-NEXT: [[TMP9:%.*]] = tail call <vscale x 64 x i8> @llvm.vector.insert.nxv64i8.nxv16i8(<vscale x 64 x i8> [[TMP7]], <vscale x 16 x i8> [[TMP8]], i64 48)
+// CHECK-NEXT: ret <vscale x 64 x i8> [[TMP9]]
+//
+// CPP-CHECK-LABEL: @_Z19test_svld4q_vnum_s8u10__SVBool_tPKal(
+// CPP-CHECK-NEXT: entry:
+// CPP-CHECK-NEXT: [[TMP0:%.*]] = getelementptr <vscale x 16 x i8>, ptr [[BASE:%.*]], i64 [[VNUM:%.*]]
+// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } @llvm.aarch64.sve.ld4q.sret.nxv16i8(<vscale x 16 x i1> [[PG:%.*]], ptr [[TMP0]])
+// CPP-CHECK-NEXT: [[TMP2:%.*]] = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } [[TMP1]], 0
+// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call <vscale x 64 x i8> @llvm.vector.insert.nxv64i8.nxv16i8(<vscale x 64 x i8> poison, <vscale x 16 x i8> [[TMP2]], i64 0)
+// CPP-CHECK-NEXT: [[TMP4:%.*]] = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } [[TMP1]], 1
+// CPP-CHECK-NEXT: [[TMP5:%.*]] = tail call <vscale x 64 x i8> @llvm.vector.insert.nxv64i8.nxv16i8(<vscale x 64 x i8> [[TMP3]], <vscale x 16 x i8> [[TMP4]], i64 16)
+// CPP-CHECK-NEXT: [[TMP6:%.*]] = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } [[TMP1]], 2
+// CPP-CHECK-NEXT: [[TMP7:%.*]] = tail call <vscale x 64 x i8> @llvm.vector.insert.nxv64i8.nxv16i8(<vscale x 64 x i8> [[TMP5]], <vscale x 16 x i8> [[TMP6]], i64 32)
+// CPP-CHECK-NEXT: [[TMP8:%.*]] = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } [[TMP1]], 3
+// CPP-CHECK-NEXT: [[TMP9:%.*]] = tail call <vscale x 64 x i8> @llvm.vector.insert.nxv64i8.nxv16i8(<vscale x 64 x i8> [[TMP7]], <vscale x 16 x i8> [[TMP8]], i64 48)
+// CPP-CHECK-NEXT: ret <vscale x 64 x i8> [[TMP9]]
+//
+svint8x4_t test_svld4q_vnum_s8(svbool_t pg, const int8_t *base, int64_t vnum)
+{
+ return SVE_ACLE_FUNC(svld4q_vnum,,_s8,)(pg, base, vnum);
+}
+// CHECK-LABEL: @test_svld4q_vnum_u16(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 8 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv8i1(<vscale x 16 x i1> [[PG:%.*]])
+// CHECK-NEXT: [[TMP1:%.*]] = getelementptr <vscale x 8 x i16>, ptr [[BASE:%.*]], i64 [[VNUM:%.*]]
+// CHECK-NEXT: [[TMP2:%.*]] = tail call { <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16> } @llvm.aarch64.sve.ld4q.sret.nxv8i16(<vscale x 8 x i1> [[TMP0]], ptr [[TMP1]])
+// CHECK-NEXT: [[TMP3:%.*]] = extractvalue { <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16> } [[TMP2]], 0
+// CHECK-NEXT: [[TMP4:%.*]] = tail call <vscale x 32 x i16> @llvm.vector.insert.nxv32i16.nxv8i16(<vscale x 32 x i16> poison, <vscale x 8 x i16> [[TMP3]], i64 0)
+// CHECK-NEXT: [[TMP5:%.*]] = extractvalue { <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16> } [[TMP2]], 1
+// CHECK-NEXT: [[TMP6:%.*]] = tail call <vscale x 32 x i16> @llvm.vector.insert.nxv32i16.nxv8i16(<vscale x 32 x i16> [[TMP4]], <vscale x 8 x i16> [[TMP5]], i64 8)
+// CHECK-NEXT: [[TMP7:%.*]] = extractvalue { <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16> } [[TMP2]], 2
+// CHECK-NEXT: [[TMP8:%.*]] = tail call <vscale x 32 x i16> @llvm.vector.insert.nxv32i16.nxv8i16(<vscale x 32 x i16> [[TMP6]], <vscale x 8 x i16> [[TMP7]], i64 16)
+// CHECK-NEXT: [[TMP9:%.*]] = extractvalue { <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16> } [[TMP2]], 3
+// CHECK-NEXT: [[TMP10:%.*]] = tail call <vscale x 32 x i16> @llvm.vector.insert.nxv32i16.nxv8i16(<vscale x 32 x i16> [[TMP8]], <vscale x 8 x i16> [[TMP9]], i64 24)
+// CHECK-NEXT: ret <vscale x 32 x i16> [[TMP10]]
+//
+// CPP-CHECK-LABEL: @_Z20test_svld4q_vnum_u16u10__SVBool_tPKtl(
+// CPP-CHECK-NEXT: entry:
+// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 8 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv8i1(<vscale x 16 x i1> [[PG:%.*]])
+// CPP-CHECK-NEXT: [[TMP1:%.*]] = getelementptr <vscale x 8 x i16>, ptr [[BASE:%.*]], i64 [[VNUM:%.*]]
+// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call { <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16> } @llvm.aarch64.sve.ld4q.sret.nxv8i16(<vscale x 8 x i1> [[TMP0]], ptr [[TMP1]])
+// CPP-CHECK-NEXT: [[TMP3:%.*]] = extractvalue { <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16> } [[TMP2]], 0
+// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call <vscale x 32 x i16> @llvm.vector.insert.nxv32i16.nxv8i16(<vscale x 32 x i16> poison, <vscale x 8 x i16> [[TMP3]], i64 0)
+// CPP-CHECK-NEXT: [[TMP5:%.*]] = extractvalue { <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16> } [[TMP2]], 1
+// CPP-CHECK-NEXT: [[TMP6:%.*]] = tail call <vscale x 32 x i16> @llvm.vector.insert.nxv32i16.nxv8i16(<vscale x 32 x i16> [[TMP4]], <vscale x 8 x i16> [[TMP5]], i64 8)
+// CPP-CHECK-NEXT: [[TMP7:%.*]] = extractvalue { <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16> } [[TMP2]], 2
+// CPP-CHECK-NEXT: [[TMP8:%.*]] = tail call <vscale x 32 x i16> @llvm.vector.insert.nxv32i16.nxv8i16(<vscale x 32 x i16> [[TMP6]], <vscale x 8 x i16> [[TMP7]], i64 16)
+// CPP-CHECK-NEXT: [[TMP9:%.*]] = extractvalue { <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16> } [[TMP2]], 3
+// CPP-CHECK-NEXT: [[TMP10:%.*]] = tail call <vscale x 32 x i16> @llvm.vector.insert.nxv32i16.nxv8i16(<vscale x 32 x i16> [[TMP8]], <vscale x 8 x i16> [[TMP9]], i64 24)
+// CPP-CHECK-NEXT: ret <vscale x 32 x i16> [[TMP10]]
+//
+svuint16x4_t test_svld4q_vnum_u16(svbool_t pg, const uint16_t *base, int64_t vnum)
+{
+ return SVE_ACLE_FUNC(svld4q_vnum,,_u16,)(pg, base, vnum);
+}
+
+// CHECK-LABEL: @test_svld4q_vnum_s16(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 8 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv8i1(<vscale x 16 x i1> [[PG:%.*]])
+// CHECK-NEXT: [[TMP1:%.*]] = getelementptr <vscale x 8 x i16>, ptr [[BASE:%.*]], i64 [[VNUM:%.*]]
+// CHECK-NEXT: [[TMP2:%.*]] = tail call { <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16> } @llvm.aarch64.sve.ld4q.sret.nxv8i16(<vscale x 8 x i1> [[TMP0]], ptr [[TMP1]])
+// CHECK-NEXT: [[TMP3:%.*]] = extractvalue { <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16> } [[TMP2]], 0
+// CHECK-NEXT: [[TMP4:%.*]] = tail call <vscale x 32 x i16> @llvm.vector.insert.nxv32i16.nxv8i16(<vscale x 32 x i16> poison, <vscale x 8 x i16> [[TMP3]], i64 0)
+// CHECK-NEXT: [[TMP5:%.*]] = extractvalue { <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16> } [[TMP2]], 1
+// CHECK-NEXT: [[TMP6:%.*]] = tail call <vscale x 32 x i16> @llvm.vector.insert.nxv32i16.nxv8i16(<vscale x 32 x i16> [[TMP4]], <vscale x 8 x i16> [[TMP5]], i64 8)
+// CHECK-NEXT: [[TMP7:%.*]] = extractvalue { <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16> } [[TMP2]], 2
+// CHECK-NEXT: [[TMP8:%.*]] = tail call <vscale x 32 x i16> @llvm.vector.insert.nxv32i16.nxv8i16(<vscale x 32 x i16> [[TMP6]], <vscale x 8 x i16> [[TMP7]], i64 16)
+// CHECK-NEXT: [[TMP9:%.*]] = extractvalue { <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16> } [[TMP2]], 3
+// CHECK-NEXT: [[TMP10:%.*]] = tail call <vscale x 32 x i16> @llvm.vector.insert.nxv32i16.nxv8i16(<vscale x 32 x i16> [[TMP8]], <vscale x 8 x i16> [[TMP9]], i64 24)
+// CHECK-NEXT: ret <vscale x 32 x i16> [[TMP10]]
+//
+// CPP-CHECK-LABEL: @_Z20test_svld4q_vnum_s16u10__SVBool_tPKsl(
+// CPP-CHECK-NEXT: entry:
+// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 8 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv8i1(<vscale x 16 x i1> [[PG:%.*]])
+// CPP-CHECK-NEXT: [[TMP1:%.*]] = getelementptr <vscale x 8 x i16>, ptr [[BASE:%.*]], i64 [[VNUM:%.*]]
+// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call { <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16> } @llvm.aarch64.sve.ld4q.sret.nxv8i16(<vscale x 8 x i1> [[TMP0]], ptr [[TMP1]])
+// CPP-CHECK-NEXT: [[TMP3:%.*]] = extractvalue { <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16> } [[TMP2]], 0
+// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call <vscale x 32 x i16> @llvm.vector.insert.nxv32i16.nxv8i16(<vscale x 32 x i16> poison, <vscale x 8 x i16> [[TMP3]], i64 0)
+// CPP-CHECK-NEXT: [[TMP5:%.*]] = extractvalue { <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16> } [[TMP2]], 1
+// CPP-CHECK-NEXT: [[TMP6:%.*]] = tail call <vscale x 32 x i16> @llvm.vector.insert.nxv32i16.nxv8i16(<vscale x 32 x i16> [[TMP4]], <vscale x 8 x i16> [[TMP5]], i64 8)
+// CPP-CHECK-NEXT: [[TMP7:%.*]] = extractvalue { <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16> } [[TMP2]], 2
+// CPP-CHECK-NEXT: [[TMP8:%.*]] = tail call <vscale x 32 x i16> @llvm.vector.insert.nxv32i16.nxv8i16(<vscale x 32 x i16> [[TMP6]], <vscale x 8 x i16> [[TMP7]], i64 16)
+// CPP-CHECK-NEXT: [[TMP9:%.*]] = extractvalue { <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16> } [[TMP2]], 3
+// CPP-CHECK-NEXT: [[TMP10:%.*]] = tail call <vscale x 32 x i16> @llvm.vector.insert.nxv32i16.nxv8i16(<vscale x 32 x i16> [[TMP8]], <vscale x 8 x i16> [[TMP9]], i64 24)
+// CPP-CHECK-NEXT: ret <vscale x 32 x i16> [[TMP10]]
+//
+svint16x4_t test_svld4q_vnum_s16(svbool_t pg, const int16_t *base, int64_t vnum)
+{
+ return SVE_ACLE_FUNC(svld4q_vnum,,_s16,)(pg, base, vnum);
+}
+
+// CHECK-LABEL: @test_svld4q_vnum_u32(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 4 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv4i1(<vscale x 16 x i1> [[PG:%.*]])
+// CHECK-NEXT: [[TMP1:%.*]] = getelementptr <vscale x 4 x i32>, ptr [[BASE:%.*]], i64 [[VNUM:%.*]]
+// CHECK-NEXT: [[TMP2:%.*]] = tail call { <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32> } @llvm.aarch64.sve.ld4q.sret.nxv4i32(<vscale x 4 x i1> [[TMP0]], ptr [[TMP1]])
+// CHECK-NEXT: [[TMP3:%.*]] = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32> } [[TMP2]], 0
+// CHECK-NEXT: [[TMP4:%.*]] = tail call <vscale x 16 x i32> @llvm.vector.insert.nxv16i32.nxv4i32(<vscale x 16 x i32> poison, <vscale x 4 x i32> [[TMP3]], i64 0)
+// CHECK-NEXT: [[TMP5:%.*]] = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32> } [[TMP2]], 1
+// CHECK-NEXT: [[TMP6:%.*]] = tail call <vscale x 16 x i32> @llvm.vector.insert.nxv16i32.nxv4i32(<vscale x 16 x i32> [[TMP4]], <vscale x 4 x i32> [[TMP5]], i64 4)
+// CHECK-NEXT: [[TMP7:%.*]] = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32> } [[TMP2]], 2
+// CHECK-NEXT: [[TMP8:%.*]] = tail call <vscale x 16 x i32> @llvm.vector.insert.nxv16i32.nxv4i32(<vscale x 16 x i32> [[TMP6]], <vscale x 4 x i32> [[TMP7]], i64 8)
+// CHECK-NEXT: [[TMP9:%.*]] = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32> } [[TMP2]], 3
+// CHECK-NEXT: [[TMP10:%.*]] = tail call <vscale x 16 x i32> @llvm.vector.insert.nxv16i32.nxv4i32(<vscale x 16 x i32> [[TMP8]], <vscale x 4 x i32> [[TMP9]], i64 12)
+// CHECK-NEXT: ret <vscale x 16 x i32> [[TMP10]]
+//
+// CPP-CHECK-LABEL: @_Z20test_svld4q_vnum_u32u10__SVBool_tPKjl(
+// CPP-CHECK-NEXT: entry:
+// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 4 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv4i1(<vscale x 16 x i1> [[PG:%.*]])
+// CPP-CHECK-NEXT: [[TMP1:%.*]] = getelementptr <vscale x 4 x i32>, ptr [[BASE:%.*]], i64 [[VNUM:%.*]]
+// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call { <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32> } @llvm.aarch64.sve.ld4q.sret.nxv4i32(<vscale x 4 x i1> [[TMP0]], ptr [[TMP1]])
+// CPP-CHECK-NEXT: [[TMP3:%.*]] = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32> } [[TMP2]], 0
+// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call <vscale x 16 x i32> @llvm.vector.insert.nxv16i32.nxv4i32(<vscale x 16 x i32> poison, <vscale x 4 x i32> [[TMP3]], i64 0)
+// CPP-CHECK-NEXT: [[TMP5:%.*]] = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32> } [[TMP2]], 1
+// CPP-CHECK-NEXT: [[TMP6:%.*]] = tail call <vscale x 16 x i32> @llvm.vector.insert.nxv16i32.nxv4i32(<vscale x 16 x i32> [[TMP4]], <vscale x 4 x i32> [[TMP5]], i64 4)
+// CPP-CHECK-NEXT: [[TMP7:%.*]] = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32> } [[TMP2]], 2
+// CPP-CHECK-NEXT: [[TMP8:%.*]] = tail call <vscale x 16 x i32> @llvm.vector.insert.nxv16i32.nxv4i32(<vscale x 16 x i32> [[TMP6]], <vscale x 4 x i32> [[TMP7]], i64 8)
+// CPP-CHECK-NEXT: [[TMP9:%.*]] = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32> } [[TMP2]], 3
+// CPP-CHECK-NEXT: [[TMP10:%.*]] = tail call <vscale x 16 x i32> @llvm.vector.insert.nxv16i32.nxv4i32(<vscale x 16 x i32> [[TMP8]], <vscale x 4 x i32> [[TMP9]], i64 12)
+// CPP-CHECK-NEXT: ret <vscale x 16 x i32> [[TMP10]]
+//
+svuint32x4_t test_svld4q_vnum_u32(svbool_t pg, const uint32_t *base, int64_t vnum)
+{
+ return SVE_ACLE_FUNC(svld4q_vnum,,_u32,)(pg, base, vnum);
+}
+
+// CHECK-LABEL: @test_svld4q_vnum_s32(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 4 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv4i1(<vscale x 16 x i1> [[PG:%.*]])
+// CHECK-NEXT: [[TMP1:%.*]] = getelementptr <vscale x 4 x i32>, ptr [[BASE:%.*]], i64 [[VNUM:%.*]]
+// CHECK-NEXT: [[TMP2:%.*]] = tail call { <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32> } @llvm.aarch64.sve.ld4q.sret.nxv4i32(<vscale x 4 x i1> [[TMP0]], ptr [[TMP1]])
+// CHECK-NEXT: [[TMP3:%.*]] = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32> } [[TMP2]], 0
+// CHECK-NEXT: [[TMP4:%.*]] = tail call <vscale x 16 x i32> @llvm.vector.insert.nxv16i32.nxv4i32(<vscale x 16 x i32> poison, <vscale x 4 x i32> [[TMP3]], i64 0)
+// CHECK-NEXT: [[TMP5:%.*]] = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32> } [[TMP2]], 1
+// CHECK-NEXT: [[TMP6:%.*]] = tail call <vscale x 16 x i32> @llvm.vector.insert.nxv16i32.nxv4i32(<vscale x 16 x i32> [[TMP4]], <vscale x 4 x i32> [[TMP5]], i64 4)
+// CHECK-NEXT: [[TMP7:%.*]] = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32> } [[TMP2]], 2
+// CHECK-NEXT: [[TMP8:%.*]] = tail call <vscale x 16 x i32> @llvm.vector.insert.nxv16i32.nxv4i32(<vscale x 16 x i32> [[TMP6]], <vscale x 4 x i32> [[TMP7]], i64 8)
+// CHECK-NEXT: [[TMP9:%.*]] = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32> } [[TMP2]], 3
+// CHECK-NEXT: [[TMP10:%.*]] = tail call <vscale x 16 x i32> @llvm.vector.insert.nxv16i32.nxv4i32(<vscale x 16 x i32> [[TMP8]], <vscale x 4 x i32> [[TMP9]], i64 12)
+// CHECK-NEXT: ret <vscale x 16 x i32> [[TMP10]]
+//
+// CPP-CHECK-LABEL: @_Z20test_svld4q_vnum_s32u10__SVBool_tPKil(
+// CPP-CHECK-NEXT: entry:
+// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 4 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv4i1(<vscale x 16 x i1> [[PG:%.*]])
+// CPP-CHECK-NEXT: [[TMP1:%.*]] = getelementptr <vscale x 4 x i32>, ptr [[BASE:%.*]], i64 [[VNUM:%.*]]
+// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call { <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32> } @llvm.aarch64.sve.ld4q.sret.nxv4i32(<vscale x 4 x i1> [[TMP0]], ptr [[TMP1]])
+// CPP-CHECK-NEXT: [[TMP3:%.*]] = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32> } [[TMP2]], 0
+// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call <vscale x 16 x i32> @llvm.vector.insert.nxv16i32.nxv4i32(<vscale x 16 x i32> poison, <vscale x 4 x i32> [[TMP3]], i64 0)
+// CPP-CHECK-NEXT: [[TMP5:%.*]] = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32> } [[TMP2]], 1
+// CPP-CHECK-NEXT: [[TMP6:%.*]] = tail call <vscale x 16 x i32> @llvm.vector.insert.nxv16i32.nxv4i32(<vscale x 16 x i32> [[TMP4]], <vscale x 4 x i32> [[TMP5]], i64 4)
+// CPP-CHECK-NEXT: [[TMP7:%.*]] = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32> } [[TMP2]], 2
+// CPP-CHECK-NEXT: [[TMP8:%.*]] = tail call <vscale x 16 x i32> @llvm.vector.insert.nxv16i32.nxv4i32(<vscale x 16 x i32> [[TMP6]], <vscale x 4 x i32> [[TMP7]], i64 8)
+// CPP-CHECK-NEXT: [[TMP9:%.*]] = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32> } [[TMP2]], 3
+// CPP-CHECK-NEXT: [[TMP10:%.*]] = tail call <vscale x 16 x i32> @llvm.vector.insert.nxv16i32.nxv4i32(<vscale x 16 x i32> [[TMP8]], <vscale x 4 x i32> [[TMP9]], i64 12)
+// CPP-CHECK-NEXT: ret <vscale x 16 x i32> [[TMP10]]
+//
+svint32x4_t test_svld4q_vnum_s32(svbool_t pg, const int32_t *base, int64_t vnum)
+{
+ return SVE_ACLE_FUNC(svld4q_vnum,,_s32,)(pg, base, vnum);
+}
+
+// CHECK-LABEL: @test_svld4q_vnum_u64(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 2 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv2i1(<vscale x 16 x i1> [[PG:%.*]])
+// CHECK-NEXT: [[TMP1:%.*]] = getelementptr <vscale x 2 x i64>, ptr [[BASE:%.*]], i64 [[VNUM:%.*]]
+// CHECK-NEXT: [[TMP2:%.*]] = tail call { <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64> } @llvm.aarch64.sve.ld4q.sret.nxv2i64(<vscale x 2 x i1> [[TMP0]], ptr [[TMP1]])
+// CHECK-NEXT: [[TMP3:%.*]] = extractvalue { <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64> } [[TMP2]], 0
+// CHECK-NEXT: [[TMP4:%.*]] = tail call <vscale x 8 x i64> @llvm.vector.insert.nxv8i64.nxv2i64(<vscale x 8 x i64> poison, <vscale x 2 x i64> [[TMP3]], i64 0)
+// CHECK-NEXT: [[TMP5:%.*]] = extractvalue { <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64> } [[TMP2]], 1
+// CHECK-NEXT: [[TMP6:%.*]] = tail call <vscale x 8 x i64> @llvm.vector.insert.nxv8i64.nxv2i64(<vscale x 8 x i64> [[TMP4]], <vscale x 2 x i64> [[TMP5]], i64 2)
+// CHECK-NEXT: [[TMP7:%.*]] = extractvalue { <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64> } [[TMP2]], 2
+// CHECK-NEXT: [[TMP8:%.*]] = tail call <vscale x 8 x i64> @llvm.vector.insert.nxv8i64.nxv2i64(<vscale x 8 x i64> [[TMP6]], <vscale x 2 x i64> [[TMP7]], i64 4)
+// CHECK-NEXT: [[TMP9:%.*]] = extractvalue { <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64> } [[TMP2]], 3
+// CHECK-NEXT: [[TMP10:%.*]] = tail call <vscale x 8 x i64> @llvm.vector.insert.nxv8i64.nxv2i64(<vscale x 8 x i64> [[TMP8]], <vscale x 2 x i64> [[TMP9]], i64 6)
+// CHECK-NEXT: ret <vscale x 8 x i64> [[TMP10]]
+//
+// CPP-CHECK-LABEL: @_Z20test_svld4q_vnum_u64u10__SVBool_tPKml(
+// CPP-CHECK-NEXT: entry:
+// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 2 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv2i1(<vscale x 16 x i1> [[PG:%.*]])
+// CPP-CHECK-NEXT: [[TMP1:%.*]] = getelementptr <vscale x 2 x i64>, ptr [[BASE:%.*]], i64 [[VNUM:%.*]]
+// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call { <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64> } @llvm.aarch64.sve.ld4q.sret.nxv2i64(<vscale x 2 x i1> [[TMP0]], ptr [[TMP1]])
+// CPP-CHECK-NEXT: [[TMP3:%.*]] = extractvalue { <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64> } [[TMP2]], 0
+// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call <vscale x 8 x i64> @llvm.vector.insert.nxv8i64.nxv2i64(<vscale x 8 x i64> poison, <vscale x 2 x i64> [[TMP3]], i64 0)
+// CPP-CHECK-NEXT: [[TMP5:%.*]] = extractvalue { <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64> } [[TMP2]], 1
+// CPP-CHECK-NEXT: [[TMP6:%.*]] = tail call <vscale x 8 x i64> @llvm.vector.insert.nxv8i64.nxv2i64(<vscale x 8 x i64> [[TMP4]], <vscale x 2 x i64> [[TMP5]], i64 2)
+// CPP-CHECK-NEXT: [[TMP7:%.*]] = extractvalue { <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64> } [[TMP2]], 2
+// CPP-CHECK-NEXT: [[TMP8:%.*]] = tail call <vscale x 8 x i64> @llvm.vector.insert.nxv8i64.nxv2i64(<vscale x 8 x i64> [[TMP6]], <vscale x 2 x i64> [[TMP7]], i64 4)
+// CPP-CHECK-NEXT: [[TMP9:%.*]] = extractvalue { <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64> } [[TMP2]], 3
+// CPP-CHECK-NEXT: [[TMP10:%.*]] = tail call <vscale x 8 x i64> @llvm.vector.insert.nxv8i64.nxv2i64(<vscale x 8 x i64> [[TMP8]], <vscale x 2 x i64> [[TMP9]], i64 6)
+// CPP-CHECK-NEXT: ret <vscale x 8 x i64> [[TMP10]]
+//
+svuint64x4_t test_svld4q_vnum_u64(svbool_t pg, const uint64_t *base, int64_t vnum)
+{
+ return SVE_ACLE_FUNC(svld4q_vnum,,_u64,)(pg, base, vnum);
+}
+
+// CHECK-LABEL: @test_svld4q_vnum_s64(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 2 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv2i1(<vscale x 16 x i1> [[PG:%.*]])
+// CHECK-NEXT: [[TMP1:%.*]] = getelementptr <vscale x 2 x i64>, ptr [[BASE:%.*]], i64 [[VNUM:%.*]]
+// CHECK-NEXT: [[TMP2:%.*]] = tail call { <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64> } @llvm.aarch64.sve.ld4q.sret.nxv2i64(<vscale x 2 x i1> [[TMP0]], ptr [[TMP1]])
+// CHECK-NEXT: [[TMP3:%.*]] = extractvalue { <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64> } [[TMP2]], 0
+// CHECK-NEXT: [[TMP4:%.*]] = tail call <vscale x 8 x i64> @llvm.vector.insert.nxv8i64.nxv2i64(<vscale x 8 x i64> poison, <vscale x 2 x i64> [[TMP3]], i64 0)
+// CHECK-NEXT: [[TMP5:%.*]] = extractvalue { <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64> } [[TMP2]], 1
+// CHECK-NEXT: [[TMP6:%.*]] = tail call <vscale x 8 x i64> @llvm.vector.insert.nxv8i64.nxv2i64(<vscale x 8 x i64> [[TMP4]], <vscale x 2 x i64> [[TMP5]], i64 2)
+// CHECK-NEXT: [[TMP7:%.*]] = extractvalue { <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64> } [[TMP2]], 2
+// CHECK-NEXT: [[TMP8:%.*]] = tail call <vscale x 8 x i64> @llvm.vector.insert.nxv8i64.nxv2i64(<vscale x 8 x i64> [[TMP6]], <vscale x 2 x i64> [[TMP7]], i64 4)
+// CHECK-NEXT: [[TMP9:%.*]] = extractvalue { <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64> } [[TMP2]], 3
+// CHECK-NEXT: [[TMP10:%.*]] = tail call <vscale x 8 x i64> @llvm.vector.insert.nxv8i64.nxv2i64(<vscale x 8 x i64> [[TMP8]], <vscale x 2 x i64> [[TMP9]], i64 6)
+// CHECK-NEXT: ret <vscale x 8 x i64> [[TMP10]]
+//
+// CPP-CHECK-LABEL: @_Z20test_svld4q_vnum_s64u10__SVBool_tPKll(
+// CPP-CHECK-NEXT: entry:
+// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 2 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv2i1(<vscale x 16 x i1> [[PG:%.*]])
+// CPP-CHECK-NEXT: [[TMP1:%.*]] = getelementptr <vscale x 2 x i64>, ptr [[BASE:%.*]], i64 [[VNUM:%.*]]
+// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call { <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64> } @llvm.aarch64.sve.ld4q.sret.nxv2i64(<vscale x 2 x i1> [[TMP0]], ptr [[TMP1]])
+// CPP-CHECK-NEXT: [[TMP3:%.*]] = extractvalue { <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64> } [[TMP2]], 0
+// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call <vscale x 8 x i64> @llvm.vector.insert.nxv8i64.nxv2i64(<vscale x 8 x i64> poison, <vscale x 2 x i64> [[TMP3]], i64 0)
+// CPP-CHECK-NEXT: [[TMP5:%.*]] = extractvalue { <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64> } [[TMP2]], 1
+// CPP-CHECK-NEXT: [[TMP6:%.*]] = tail call <vscale x 8 x i64> @llvm.vector.insert.nxv8i64.nxv2i64(<vscale x 8 x i64> [[TMP4]], <vscale x 2 x i64> [[TMP5]], i64 2)
+// CPP-CHECK-NEXT: [[TMP7:%.*]] = extractvalue { <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64> } [[TMP2]], 2
+// CPP-CHECK-NEXT: [[TMP8:%.*]] = tail call <vscale x 8 x i64> @llvm.vector.insert.nxv8i64.nxv2i64(<vscale x 8 x i64> [[TMP6]], <vscale x 2 x i64> [[TMP7]], i64 4)
+// CPP-CHECK-NEXT: [[TMP9:%.*]] = extractvalue { <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64> } [[TMP2]], 3
+// CPP-CHECK-NEXT: [[TMP10:%.*]] = tail call <vscale x 8 x i64> @llvm.vector.insert.nxv8i64.nxv2i64(<vscale x 8 x i64> [[TMP8]], <vscale x 2 x i64> [[TMP9]], i64 6)
+// CPP-CHECK-NEXT: ret <vscale x 8 x i64> [[TMP10]]
+//
+svint64x4_t test_svld4q_vnum_s64(svbool_t pg, const int64_t *base, int64_t vnum)
+{
+ return SVE_ACLE_FUNC(svld4q_vnum,,_s64,)(pg, base, vnum);
+}
+
+// CHECK-LABEL: @test_svld4q_vnum_f16(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 8 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv8i1(<vscale x 16 x i1> [[PG:%.*]])
+// CHECK-NEXT: [[TMP1:%.*]] = getelementptr <vscale x 8 x half>, ptr [[BASE:%.*]], i64 [[VNUM:%.*]]
+// CHECK-NEXT: [[TMP2:%.*]] = tail call { <vscale x 8 x half>, <vscale x 8 x half>, <vscale x 8 x half>, <vscale x 8 x half> } @llvm.aarch64.sve.ld4q.sret.nxv8f16(<vscale x 8 x i1> [[TMP0]], ptr [[TMP1]])
+// CHECK-NEXT: [[TMP3:%.*]] = extractvalue { <vscale x 8 x half>, <vscale x 8 x half>, <vscale x 8 x half>, <vscale x 8 x half> } [[TMP2]], 0
+// CHECK-NEXT: [[TMP4:%.*]] = tail call <vscale x 32 x half> @llvm.vector.insert.nxv32f16.nxv8f16(<vscale x 32 x half> poison, <vscale x 8 x half> [[TMP3]], i64 0)
+// CHECK-NEXT: [[TMP5:%.*]] = extractvalue { <vscale x 8 x half>, <vscale x 8 x half>, <vscale x 8 x half>, <vscale x 8 x half> } [[TMP2]], 1
+// CHECK-NEXT: [[TMP6:%.*]] = tail call <vscale x 32 x half> @llvm.vector.insert.nxv32f16.nxv8f16(<vscale x 32 x half> [[TMP4]], <vscale x 8 x half> [[TMP5]], i64 8)
+// CHECK-NEXT: [[TMP7:%.*]] = extractvalue { <vscale x 8 x half>, <vscale x 8 x half>, <vscale x 8 x half>, <vscale x 8 x half> } [[TMP2]], 2
+// CHECK-NEXT: [[TMP8:%.*]] = tail call <vscale x 32 x half> @llvm.vector.insert.nxv32f16.nxv8f16(<vscale x 32 x half> [[TMP6]], <vscale x 8 x half> [[TMP7]], i64 16)
+// CHECK-NEXT: [[TMP9:%.*]] = extractvalue { <vscale x 8 x half>, <vscale x 8 x half>, <vscale x 8 x half>, <vscale x 8 x half> } [[TMP2]], 3
+// CHECK-NEXT: [[TMP10:%.*]] = tail call <vscale x 32 x half> @llvm.vector.insert.nxv32f16.nxv8f16(<vscale x 32 x half> [[TMP8]], <vscale x 8 x half> [[TMP9]], i64 24)
+// CHECK-NEXT: ret <vscale x 32 x half> [[TMP10]]
+//
+// CPP-CHECK-LABEL: @_Z20test_svld4q_vnum_f16u10__SVBool_tPKDhl(
+// CPP-CHECK-NEXT: entry:
+// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 8 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv8i1(<vscale x 16 x i1> [[PG:%.*]])
+// CPP-CHECK-NEXT: [[TMP1:%.*]] = getelementptr <vscale x 8 x half>, ptr [[BASE:%.*]], i64 [[VNUM:%.*]]
+// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call { <vscale x 8 x half>, <vscale x 8 x half>, <vscale x 8 x half>, <vscale x 8 x half> } @llvm.aarch64.sve.ld4q.sret.nxv8f16(<vscale x 8 x i1> [[TMP0]], ptr [[TMP1]])
+// CPP-CHECK-NEXT: [[TMP3:%.*]] = extractvalue { <vscale x 8 x half>, <vscale x 8 x half>, <vscale x 8 x half>, <vscale x 8 x half> } [[TMP2]], 0
+// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call <vscale x 32 x half> @llvm.vector.insert.nxv32f16.nxv8f16(<vscale x 32 x half> poison, <vscale x 8 x half> [[TMP3]], i64 0)
+// CPP-CHECK-NEXT: [[TMP5:%.*]] = extractvalue { <vscale x 8 x half>, <vscale x 8 x half>, <vscale x 8 x half>, <vscale x 8 x half> } [[TMP2]], 1
+// CPP-CHECK-NEXT: [[TMP6:%.*]] = tail call <vscale x 32 x half> @llvm.vector.insert.nxv32f16.nxv8f16(<vscale x 32 x half> [[TMP4]], <vscale x 8 x half> [[TMP5]], i64 8)
+// CPP-CHECK-NEXT: [[TMP7:%.*]] = extractvalue { <vscale x 8 x half>, <vscale x 8 x half>, <vscale x 8 x half>, <vscale x 8 x half> } [[TMP2]], 2
+// CPP-CHECK-NEXT: [[TMP8:%.*]] = tail call <vscale x 32 x half> @llvm.vector.insert.nxv32f16.nxv8f16(<vscale x 32 x half> [[TMP6]], <vscale x 8 x half> [[TMP7]], i64 16)
+// CPP-CHECK-NEXT: [[TMP9:%.*]] = extractvalue { <vscale x 8 x half>, <vscale x 8 x half>, <vscale x 8 x half>, <vscale x 8 x half> } [[TMP2]], 3
+// CPP-CHECK-NEXT: [[TMP10:%.*]] = tail call <vscale x 32 x half> @llvm.vector.insert.nxv32f16.nxv8f16(<vscale x 32 x half> [[TMP8]], <vscale x 8 x half> [[TMP9]], i64 24)
+// CPP-CHECK-NEXT: ret <vscale x 32 x half> [[TMP10]]
+//
+svfloat16x4_t test_svld4q_vnum_f16(svbool_t pg, const float16_t *base, int64_t vnum)
+{
+ return SVE_ACLE_FUNC(svld4q_vnum,,_f16,)(pg, base, vnum);
+}
+
+// CHECK-LABEL: @test_svld4q_vnum_bf16(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 8 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv8i1(<vscale x 16 x i1> [[PG:%.*]])
+// CHECK-NEXT: [[TMP1:%.*]] = getelementptr <vscale x 8 x bfloat>, ptr [[BASE:%.*]], i64 [[VNUM:%.*]]
+// CHECK-NEXT: [[TMP2:%.*]] = tail call { <vscale x 8 x bfloat>, <vscale x 8 x bfloat>, <vscale x 8 x bfloat>, <vscale x 8 x bfloat> } @llvm.aarch64.sve.ld4q.sret.nxv8bf16(<vscale x 8 x i1> [[TMP0]], ptr [[TMP1]])
+// CHECK-NEXT: [[TMP3:%.*]] = extractvalue { <vscale x 8 x bfloat>, <vscale x 8 x bfloat>, <vscale x 8 x bfloat>, <vscale x 8 x bfloat> } [[TMP2]], 0
+// CHECK-NEXT: [[TMP4:%.*]] = tail call <vscale x 32 x bfloat> @llvm.vector.insert.nxv32bf16.nxv8bf16(<vscale x 32 x bfloat> poison, <vscale x 8 x bfloat> [[TMP3]], i64 0)
+// CHECK-NEXT: [[TMP5:%.*]] = extractvalue { <vscale x 8 x bfloat>, <vscale x 8 x bfloat>, <vscale x 8 x bfloat>, <vscale x 8 x bfloat> } [[TMP2]], 1
+// CHECK-NEXT: [[TMP6:%.*]] = tail call <vscale x 32 x bfloat> @llvm.vector.insert.nxv32bf16.nxv8bf16(<vscale x 32 x bfloat> [[TMP4]], <vscale x 8 x bfloat> [[TMP5]], i64 8)
+// CHECK-NEXT: [[TMP7:%.*]] = extractvalue { <vscale x 8 x bfloat>, <vscale x 8 x bfloat>, <vscale x 8 x bfloat>, <vscale x 8 x bfloat> } [[TMP2]], 2
+// CHECK-NEXT: [[TMP8:%.*]] = tail call <vscale x 32 x bfloat> @llvm.vector.insert.nxv32bf16.nxv8bf16(<vscale x 32 x bfloat> [[TMP6]], <vscale x 8 x bfloat> [[TMP7]], i64 16)
+// CHECK-NEXT: [[TMP9:%.*]] = extractvalue { <vscale x 8 x bfloat>, <vscale x 8 x bfloat>, <vscale x 8 x bfloat>, <vscale x 8 x bfloat> } [[TMP2]], 3
+// CHECK-NEXT: [[TMP10:%.*]] = tail call <vscale x 32 x bfloat> @llvm.vector.insert.nxv32bf16.nxv8bf16(<vscale x 32 x bfloat> [[TMP8]], <vscale x 8 x bfloat> [[TMP9]], i64 24)
+// CHECK-NEXT: ret <vscale x 32 x bfloat> [[TMP10]]
+//
+// CPP-CHECK-LABEL: @_Z21test_svld4q_vnum_bf16u10__SVBool_tPKu6__bf16l(
+// CPP-CHECK-NEXT: entry:
+// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 8 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv8i1(<vscale x 16 x i1> [[PG:%.*]])
+// CPP-CHECK-NEXT: [[TMP1:%.*]] = getelementptr <vscale x 8 x bfloat>, ptr [[BASE:%.*]], i64 [[VNUM:%.*]]
+// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call { <vscale x 8 x bfloat>, <vscale x 8 x bfloat>, <vscale x 8 x bfloat>, <vscale x 8 x bfloat> } @llvm.aarch64.sve.ld4q.sret.nxv8bf16(<vscale x 8 x i1> [[TMP0]], ptr [[TMP1]])
+// CPP-CHECK-NEXT: [[TMP3:%.*]] = extractvalue { <vscale x 8 x bfloat>, <vscale x 8 x bfloat>, <vscale x 8 x bfloat>, <vscale x 8 x bfloat> } [[TMP2]], 0
+// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call <vscale x 32 x bfloat> @llvm.vector.insert.nxv32bf16.nxv8bf16(<vscale x 32 x bfloat> poison, <vscale x 8 x bfloat> [[TMP3]], i64 0)
+// CPP-CHECK-NEXT: [[TMP5:%.*]] = extractvalue { <vscale x 8 x bfloat>, <vscale x 8 x bfloat>, <vscale x 8 x bfloat>, <vscale x 8 x bfloat> } [[TMP2]], 1
+// CPP-CHECK-NEXT: [[TMP6:%.*]] = tail call <vscale x 32 x bfloat> @llvm.vector.insert.nxv32bf16.nxv8bf16(<vscale x 32 x bfloat> [[TMP4]], <vscale x 8 x bfloat> [[TMP5]], i64 8)
+// CPP-CHECK-NEXT: [[TMP7:%.*]] = extractvalue { <vscale x 8 x bfloat>, <vscale x 8 x bfloat>, <vscale x 8 x bfloat>, <vscale x 8 x bfloat> } [[TMP2]], 2
+// CPP-CHECK-NEXT: [[TMP8:%.*]] = tail call <vscale x 32 x bfloat> @llvm.vector.insert.nxv32bf16.nxv8bf16(<vscale x 32 x bfloat> [[TMP6]], <vscale x 8 x bfloat> [[TMP7]], i64 16)
+// CPP-CHECK-NEXT: [[TMP9:%.*]] = extractvalue { <vscale x 8 x bfloat>, <vscale x 8 x bfloat>, <vscale x 8 x bfloat>, <vscale x 8 x bfloat> } [[TMP2]], 3
+// CPP-CHECK-NEXT: [[TMP10:%.*]] = tail call <vscale x 32 x bfloat> @llvm.vector.insert.nxv32bf16.nxv8bf16(<vscale x 32 x bfloat> [[TMP8]], <vscale x 8 x bfloat> [[TMP9]], i64 24)
+// CPP-CHECK-NEXT: ret <vscale x 32 x bfloat> [[TMP10]]
+//
+svbfloat16x4_t test_svld4q_vnum_bf16(svbool_t pg, const bfloat16_t *base, int64_t vnum)
+{
+ return SVE_ACLE_FUNC(svld4q_vnum,,_bf16,)(pg, base, vnum);
+}
+
+// CHECK-LABEL: @test_svld4q_vnum_f32(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 4 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv4i1(<vscale x 16 x i1> [[PG:%.*]])
+// CHECK-NEXT: [[TMP1:%.*]] = getelementptr <vscale x 4 x float>, ptr [[BASE:%.*]], i64 [[VNUM:%.*]]
+// CHECK-NEXT: [[TMP2:%.*]] = tail call { <vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float> } @llvm.aarch64.sve.ld4q.sret.nxv4f32(<vscale x 4 x i1> [[TMP0]], ptr [[TMP1]])
+// CHECK-NEXT: [[TMP3:%.*]] = extractvalue { <vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float> } [[TMP2]], 0
+// CHECK-NEXT: [[TMP4:%.*]] = tail call <vscale x 16 x float> @llvm.vector.insert.nxv16f32.nxv4f32(<vscale x 16 x float> poison, <vscale x 4 x float> [[TMP3]], i64 0)
+// CHECK-NEXT: [[TMP5:%.*]] = extractvalue { <vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float> } [[TMP2]], 1
+// CHECK-NEXT: [[TMP6:%.*]] = tail call <vscale x 16 x float> @llvm.vector.insert.nxv16f32.nxv4f32(<vscale x 16 x float> [[TMP4]], <vscale x 4 x float> [[TMP5]], i64 4)
+// CHECK-NEXT: [[TMP7:%.*]] = extractvalue { <vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float> } [[TMP2]], 2
+// CHECK-NEXT: [[TMP8:%.*]] = tail call <vscale x 16 x float> @llvm.vector.insert.nxv16f32.nxv4f32(<vscale x 16 x float> [[TMP6]], <vscale x 4 x float> [[TMP7]], i64 8)
+// CHECK-NEXT: [[TMP9:%.*]] = extractvalue { <vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float> } [[TMP2]], 3
+// CHECK-NEXT: [[TMP10:%.*]] = tail call <vscale x 16 x float> @llvm.vector.insert.nxv16f32.nxv4f32(<vscale x 16 x float> [[TMP8]], <vscale x 4 x float> [[TMP9]], i64 12)
+// CHECK-NEXT: ret <vscale x 16 x float> [[TMP10]]
+//
+// CPP-CHECK-LABEL: @_Z20test_svld4q_vnum_f32u10__SVBool_tPKfl(
+// CPP-CHECK-NEXT: entry:
+// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 4 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv4i1(<vscale x 16 x i1> [[PG:%.*]])
+// CPP-CHECK-NEXT: [[TMP1:%.*]] = getelementptr <vscale x 4 x float>, ptr [[BASE:%.*]], i64 [[VNUM:%.*]]
+// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call { <vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float> } @llvm.aarch64.sve.ld4q.sret.nxv4f32(<vscale x 4 x i1> [[TMP0]], ptr [[TMP1]])
+// CPP-CHECK-NEXT: [[TMP3:%.*]] = extractvalue { <vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float> } [[TMP2]], 0
+// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call <vscale x 16 x float> @llvm.vector.insert.nxv16f32.nxv4f32(<vscale x 16 x float> poison, <vscale x 4 x float> [[TMP3]], i64 0)
+// CPP-CHECK-NEXT: [[TMP5:%.*]] = extractvalue { <vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float> } [[TMP2]], 1
+// CPP-CHECK-NEXT: [[TMP6:%.*]] = tail call <vscale x 16 x float> @llvm.vector.insert.nxv16f32.nxv4f32(<vscale x 16 x float> [[TMP4]], <vscale x 4 x float> [[TMP5]], i64 4)
+// CPP-CHECK-NEXT: [[TMP7:%.*]] = extractvalue { <vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float> } [[TMP2]], 2
+// CPP-CHECK-NEXT: [[TMP8:%.*]] = tail call <vscale x 16 x float> @llvm.vector.insert.nxv16f32.nxv4f32(<vscale x 16 x float> [[TMP6]], <vscale x 4 x float> [[TMP7]], i64 8)
+// CPP-CHECK-NEXT: [[TMP9:%.*]] = extractvalue { <vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float> } [[TMP2]], 3
+// CPP-CHECK-NEXT: [[TMP10:%.*]] = tail call <vscale x 16 x float> @llvm.vector.insert.nxv16f32.nxv4f32(<vscale x 16 x float> [[TMP8]], <vscale x 4 x float> [[TMP9]], i64 12)
+// CPP-CHECK-NEXT: ret <vscale x 16 x float> [[TMP10]]
+//
+svfloat32x4_t test_svld4q_vnum_f32(svbool_t pg, const float32_t *base, int64_t vnum)
+{
+ return SVE_ACLE_FUNC(svld4q_vnum,,_f32,)(pg, base, vnum);
+}
+
+// CHECK-LABEL: @test_svld4q_vnum_f64(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 2 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv2i1(<vscale x 16 x i1> [[PG:%.*]])
+// CHECK-NEXT: [[TMP1:%.*]] = getelementptr <vscale x 2 x double>, ptr [[BASE:%.*]], i64 [[VNUM:%.*]]
+// CHECK-NEXT: [[TMP2:%.*]] = tail call { <vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x double> } @llvm.aarch64.sve.ld4q.sret.nxv2f64(<vscale x 2 x i1> [[TMP0]], ptr [[TMP1]])
+// CHECK-NEXT: [[TMP3:%.*]] = extractvalue { <vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x double> } [[TMP2]], 0
+// CHECK-NEXT: [[TMP4:%.*]] = tail call <vscale x 8 x double> @llvm.vector.insert.nxv8f64.nxv2f64(<vscale x 8 x double> poison, <vscale x 2 x double> [[TMP3]], i64 0)
+// CHECK-NEXT: [[TMP5:%.*]] = extractvalue { <vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x double> } [[TMP2]], 1
+// CHECK-NEXT: [[TMP6:%.*]] = tail call <vscale x 8 x double> @llvm.vector.insert.nxv8f64.nxv2f64(<vscale x 8 x double> [[TMP4]], <vscale x 2 x double> [[TMP5]], i64 2)
+// CHECK-NEXT: [[TMP7:%.*]] = extractvalue { <vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x double> } [[TMP2]], 2
+// CHECK-NEXT: [[TMP8:%.*]] = tail call <vscale x 8 x double> @llvm.vector.insert.nxv8f64.nxv2f64(<vscale x 8 x double> [[TMP6]], <vscale x 2 x double> [[TMP7]], i64 4)
+// CHECK-NEXT: [[TMP9:%.*]] = extractvalue { <vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x double> } [[TMP2]], 3
+// CHECK-NEXT: [[TMP10:%.*]] = tail call <vscale x 8 x double> @llvm.vector.insert.nxv8f64.nxv2f64(<vscale x 8 x double> [[TMP8]], <vscale x 2 x double> [[TMP9]], i64 6)
+// CHECK-NEXT: ret <vscale x 8 x double> [[TMP10]]
+//
+// CPP-CHECK-LABEL: @_Z20test_svld4q_vnum_f64u10__SVBool_tPKdl(
+// CPP-CHECK-NEXT: entry:
+// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 2 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv2i1(<vscale x 16 x i1> [[PG:%.*]])
+// CPP-CHECK-NEXT: [[TMP1:%.*]] = getelementptr <vscale x 2 x double>, ptr [[BASE:%.*]], i64 [[VNUM:%.*]]
+// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call { <vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x double> } @llvm.aarch64.sve.ld4q.sret.nxv2f64(<vscale x 2 x i1> [[TMP0]], ptr [[TMP1]])
+// CPP-CHECK-NEXT: [[TMP3:%.*]] = extractvalue { <vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x double> } [[TMP2]], 0
+// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call <vscale x 8 x double> @llvm.vector.insert.nxv8f64.nxv2f64(<vscale x 8 x double> poison, <vscale x 2 x double> [[TMP3]], i64 0)
+// CPP-CHECK-NEXT: [[TMP5:%.*]] = extractvalue { <vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x double> } [[TMP2]], 1
+// CPP-CHECK-NEXT: [[TMP6:%.*]] = tail call <vscale x 8 x double> @llvm.vector.insert.nxv8f64.nxv2f64(<vscale x 8 x double> [[TMP4]], <vscale x 2 x double> [[TMP5]], i64 2)
+// CPP-CHECK-NEXT: [[TMP7:%.*]] = extractvalue { <vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x double> } [[TMP2]], 2
+// CPP-CHECK-NEXT: [[TMP8:%.*]] = tail call <vscale x 8 x double> @llvm.vector.insert.nxv8f64.nxv2f64(<vscale x 8 x double> [[TMP6]], <vscale x 2 x double> [[TMP7]], i64 4)
+// CPP-CHECK-NEXT: [[TMP9:%.*]] = extractvalue { <vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x double> } [[TMP2]], 3
+// CPP-CHECK-NEXT: [[TMP10:%.*]] = tail call <vscale x 8 x double> @llvm.vector.insert.nxv8f64.nxv2f64(<vscale x 8 x double> [[TMP8]], <vscale x 2 x double> [[TMP9]], i64 6)
+// CPP-CHECK-NEXT: ret <vscale x 8 x double> [[TMP10]]
+//
+svfloat64x4_t test_svld4q_vnum_f64(svbool_t pg, const float64_t *base, int64_t vnum)
+{
+ return SVE_ACLE_FUNC(svld4q_vnum,,_f64,)(pg, base, vnum);
+}
+
+
+// Gather for 128 bits
+// vector base + scalar offset
+// CHECK-LABEL: @test_svld1q_gather_u64base_offset_u64(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 2 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv2i1(<vscale x 16 x i1> [[PG:%.*]])
+// CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 2 x i64> @llvm.aarch64.sve.ld1q.gather.scalar.offset.nxv2i64.nxv2i64(<vscale x 2 x i1> [[TMP0]], <vscale x 2 x i64> [[BASE:%.*]], i64 [[OFFSET:%.*]])
+// CHECK-NEXT: ret <vscale x 2 x i64> [[TMP1]]
+//
+// CPP-CHECK-LABEL: @_Z37test_svld1q_gather_u64base_offset_u64u10__SVBool_tu12__SVUint64_tl(
+// CPP-CHECK-NEXT: entry:
+// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 2 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv2i1(<vscale x 16 x i1> [[PG:%.*]])
+// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 2 x i64> @llvm.aarch64.sve.ld1q.gather.scalar.offset.nxv2i64.nxv2i64(<vscale x 2 x i1> [[TMP0]], <vscale x 2 x i64> [[BASE:%.*]], i64 [[OFFSET:%.*]])
+// CPP-CHECK-NEXT: ret <vscale x 2 x i64> [[TMP1]]
+//
+svuint64_t test_svld1q_gather_u64base_offset_u64(svbool_t pg, svuint64_t base, int64_t offset)
+{
+ return SVE_ACLE_FUNC(svld1q_gather,_u64base,_offset_u64,)(pg, base, offset);
+}
+
+// CHECK-LABEL: @test_svld1q_gather_u64base_offset_s64(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 2 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv2i1(<vscale x 16 x i1> [[PG:%.*]])
+// CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 2 x i64> @llvm.aarch64.sve.ld1q.gather.scalar.offset.nxv2i64.nxv2i64(<vscale x 2 x i1> [[TMP0]], <vscale x 2 x i64> [[BASE:%.*]], i64 [[OFFSET:%.*]])
+// CHECK-NEXT: ret <vscale x 2 x i64> [[TMP1]]
+//
+// CPP-CHECK-LABEL: @_Z37test_svld1q_gather_u64base_offset_s64u10__SVBool_tu12__SVUint64_tl(
+// CPP-CHECK-NEXT: entry:
+// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 2 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv2i1(<vscale x 16 x i1> [[PG:%.*]])
+// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 2 x i64> @llvm.aarch64.sve.ld1q.gather.scalar.offset.nxv2i64.nxv2i64(<vscale x 2 x i1> [[TMP0]], <vscale x 2 x i64> [[BASE:%.*]], i64 [[OFFSET:%.*]])
+// CPP-CHECK-NEXT: ret <vscale x 2 x i64> [[TMP1]]
+//
+svint64_t test_svld1q_gather_u64base_offset_s64(svbool_t pg, svuint64_t base, int64_t offset)
+{
+ return SVE_ACLE_FUNC(svld1q_gather,_u64base,_offset_s64,)(pg, base, offset);
+}
+
+// CHECK-LABEL: @test_svld1q_gather_u64base_offset_u32(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 4 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv4i1(<vscale x 16 x i1> [[PG:%.*]])
+// CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 4 x i32> @llvm.aarch64.sve.ld1q.gather.scalar.offset.nxv4i32.nxv2i64(<vscale x 4 x i1> [[TMP0]], <vscale x 2 x i64> [[BASE:%.*]], i64 [[OFFSET:%.*]])
+// CHECK-NEXT: ret <vscale x 4 x i32> [[TMP1]]
+//
+// CPP-CHECK-LABEL: @_Z37test_svld1q_gather_u64base_offset_u32u10__SVBool_tu12__SVUint64_tl(
+// CPP-CHECK-NEXT: entry:
+// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 4 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv4i1(<vscale x 16 x i1> [[PG:%.*]])
+// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 4 x i32> @llvm.aarch64.sve.ld1q.gather.scalar.offset.nxv4i32.nxv2i64(<vscale x 4 x i1> [[TMP0]], <vscale x 2 x i64> [[BASE:%.*]], i64 [[OFFSET:%.*]])
+// CPP-CHECK-NEXT: ret <vscale x 4 x i32> [[TMP1]]
+//
+svuint32_t test_svld1q_gather_u64base_offset_u32(svbool_t pg, svuint64_t base, int64_t offset)
+{
+ return SVE_ACLE_FUNC(svld1q_gather,_u64base,_offset_u32,)(pg, base, offset);
+}
+
+// CHECK-LABEL: @test_svld1q_gather_u64base_offset_s32(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 4 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv4i1(<vscale x 16 x i1> [[PG:%.*]])
+// CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 4 x i32> @llvm.aarch64.sve.ld1q.gather.scalar.offset.nxv4i32.nxv2i64(<vscale x 4 x i1> [[TMP0]], <vscale x 2 x i64> [[BASE:%.*]], i64 [[OFFSET:%.*]])
+// CHECK-NEXT: ret <vscale x 4 x i32> [[TMP1]]
+//
+// CPP-CHECK-LABEL: @_Z37test_svld1q_gather_u64base_offset_s32u10__SVBool_tu12__SVUint64_tl(
+// CPP-CHECK-NEXT: entry:
+// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 4 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv4i1(<vscale x 16 x i1> [[PG:%.*]])
+// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 4 x i32> @llvm.aarch64.sve.ld1q.gather.scalar.offset.nxv4i32.nxv2i64(<vscale x 4 x i1> [[TMP0]], <vscale x 2 x i64> [[BASE:%.*]], i64 [[OFFSET:%.*]])
+// CPP-CHECK-NEXT: ret <vscale x 4 x i32> [[TMP1]]
+//
+svint32_t test_svld1q_gather_u64base_offset_s32(svbool_t pg, svuint64_t base, int64_t offset)
+{
+ return SVE_ACLE_FUNC(svld1q_gather,_u64base,_offset_s32,)(pg, base, offset);
+}
+
+// CHECK-LABEL: @test_svld1q_gather_u64base_offset_u16(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 8 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv8i1(<vscale x 16 x i1> [[PG:%.*]])
+// CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 8 x i16> @llvm.aarch64.sve.ld1q.gather.scalar.offset.nxv8i16.nxv2i64(<vscale x 8 x i1> [[TMP0]], <vscale x 2 x i64> [[BASE:%.*]], i64 [[OFFSET:%.*]])
+// CHECK-NEXT: ret <vscale x 8 x i16> [[TMP1]]
+//
+// CPP-CHECK-LABEL: @_Z37test_svld1q_gather_u64base_offset_u16u10__SVBool_tu12__SVUint64_tl(
+// CPP-CHECK-NEXT: entry:
+// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 8 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv8i1(<vscale x 16 x i1> [[PG:%.*]])
+// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 8 x i16> @llvm.aarch64.sve.ld1q.gather.scalar.offset.nxv8i16.nxv2i64(<vscale x 8 x i1> [[TMP0]], <vscale x 2 x i64> [[BASE:%.*]], i64 [[OFFSET:%.*]])
+// CPP-CHECK-NEXT: ret <vscale x 8 x i16> [[TMP1]]
+//
+svuint16_t test_svld1q_gather_u64base_offset_u16(svbool_t pg, svuint64_t base, int64_t offset)
+{
+ return SVE_ACLE_FUNC(svld1q_gather,_u64base,_offset_u16,)(pg, base, offset);
+}
+
+// CHECK-LABEL: @test_svld1q_gather_u64base_offset_s16(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 8 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv8i1(<vscale x 16 x i1> [[PG:%.*]])
+// CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 8 x i16> @llvm.aarch64.sve.ld1q.gather.scalar.offset.nxv8i16.nxv2i64(<vscale x 8 x i1> [[TMP0]], <vscale x 2 x i64> [[BASE:%.*]], i64 [[OFFSET:%.*]])
+// CHECK-NEXT: ret <vscale x 8 x i16> [[TMP1]]
+//
+// CPP-CHECK-LABEL: @_Z37test_svld1q_gather_u64base_offset_s16u10__SVBool_tu12__SVUint64_tl(
+// CPP-CHECK-NEXT: entry:
+// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 8 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv8i1(<vscale x 16 x i1> [[PG:%.*]])
+// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 8 x i16> @llvm.aarch64.sve.ld1q.gather.scalar.offset.nxv8i16.nxv2i64(<vscale x 8 x i1> [[TMP0]], <vscale x 2 x i64> [[BASE:%.*]], i64 [[OFFSET:%.*]])
+// CPP-CHECK-NEXT: ret <vscale x 8 x i16> [[TMP1]]
+//
+svint16_t test_svld1q_gather_u64base_offset_s16(svbool_t pg, svuint64_t base, int64_t offset)
+{
+ return SVE_ACLE_FUNC(svld1q_gather,_u64base,_offset_s16,)(pg, base, offset);
+}
+
+// CHECK-LABEL: @test_svld1q_gather_u64base_offset_u8(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 16 x i8> @llvm.aarch64.sve.ld1q.gather.scalar.offset.nxv16i8.nxv2i64(<vscale x 16 x i1> [[PG:%.*]], <vscale x 2 x i64> [[BASE:%.*]], i64 [[OFFSET:%.*]])
+// CHECK-NEXT: ret <vscale x 16 x i8> [[TMP0]]
+//
+// CPP-CHECK-LABEL: @_Z36test_svld1q_gather_u64base_offset_u8u10__SVBool_tu12__SVUint64_tl(
+// CPP-CHECK-NEXT: entry:
+// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 16 x i8> @llvm.aarch64.sve.ld1q.gather.scalar.offset.nxv16i8.nxv2i64(<vscale x 16 x i1> [[PG:%.*]], <vscale x 2 x i64> [[BASE:%.*]], i64 [[OFFSET:%.*]])
+// CPP-CHECK-NEXT: ret <vscale x 16 x i8> [[TMP0]]
+//
+svuint8_t test_svld1q_gather_u64base_offset_u8(svbool_t pg, svuint64_t base, int64_t offset)
+{
+ return SVE_ACLE_FUNC(svld1q_gather,_u64base,_offset_u8,)(pg, base, offset);
+}
+
+// CHECK-LABEL: @test_svld1q_gather_u64base_offset_s8(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 16 x i8> @llvm.aarch64.sve.ld1q.gather.scalar.offset.nxv16i8.nxv2i64(<vscale x 16 x i1> [[PG:%.*]], <vscale x 2 x i64> [[BASE:%.*]], i64 [[OFFSET:%.*]])
+// CHECK-NEXT: ret <vscale x 16 x i8> [[TMP0]]
+//
+// CPP-CHECK-LABEL: @_Z36test_svld1q_gather_u64base_offset_s8u10__SVBool_tu12__SVUint64_tl(
+// CPP-CHECK-NEXT: entry:
+// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 16 x i8> @llvm.aarch64.sve.ld1q.gather.scalar.offset.nxv16i8.nxv2i64(<vscale x 16 x i1> [[PG:%.*]], <vscale x 2 x i64> [[BASE:%.*]], i64 [[OFFSET:%.*]])
+// CPP-CHECK-NEXT: ret <vscale x 16 x i8> [[TMP0]]
+//
+svint8_t test_svld1q_gather_u64base_offset_s8(svbool_t pg, svuint64_t base, int64_t offset)
+{
+ return SVE_ACLE_FUNC(svld1q_gather,_u64base,_offset_s8,)(pg, base, offset);
+}
+
+// CHECK-LABEL: @test_svld1q_gather_u64base_offset_f64(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 2 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv2i1(<vscale x 16 x i1> [[PG:%.*]])
+// CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 2 x double> @llvm.aarch64.sve.ld1q.gather.scalar.offset.nxv2f64.nxv2i64(<vscale x 2 x i1> [[TMP0]], <vscale x 2 x i64> [[BASE:%.*]], i64 [[OFFSET:%.*]])
+// CHECK-NEXT: ret <vscale x 2 x double> [[TMP1]]
+//
+// CPP-CHECK-LABEL: @_Z37test_svld1q_gather_u64base_offset_f64u10__SVBool_tu12__SVUint64_tl(
+// CPP-CHECK-NEXT: entry:
+// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 2 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv2i1(<vscale x 16 x i1> [[PG:%.*]])
+// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 2 x double> @llvm.aarch64.sve.ld1q.gather.scalar.offset.nxv2f64.nxv2i64(<vscale x 2 x i1> [[TMP0]], <vscale x 2 x i64> [[BASE:%.*]], i64 [[OFFSET:%.*]])
+// CPP-CHECK-NEXT: ret <vscale x 2 x double> [[TMP1]]
+//
+svfloat64_t test_svld1q_gather_u64base_offset_f64(svbool_t pg, svuint64_t base, int64_t offset)
+{
+ return SVE_ACLE_FUNC(svld1q_gather,_u64base,_offset_f64,)(pg, base, offset);
+}
+
+// CHECK-LABEL: @test_svld1q_gather_u64base_offset_f32(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 4 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv4i1(<vscale x 16 x i1> [[PG:%.*]])
+// CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 4 x float> @llvm.aarch64.sve.ld1q.gather.scalar.offset.nxv4f32.nxv2i64(<vscale x 4 x i1> [[TMP0]], <vscale x 2 x i64> [[BASE:%.*]], i64 [[OFFSET:%.*]])
+// CHECK-NEXT: ret <vscale x 4 x float> [[TMP1]]
+//
+// CPP-CHECK-LABEL: @_Z37test_svld1q_gather_u64base_offset_f32u10__SVBool_tu12__SVUint64_tl(
+// CPP-CHECK-NEXT: entry:
+// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 4 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv4i1(<vscale x 16 x i1> [[PG:%.*]])
+// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 4 x float> @llvm.aarch64.sve.ld1q.gather.scalar.offset.nxv4f32.nxv2i64(<vscale x 4 x i1> [[TMP0]], <vscale x 2 x i64> [[BASE:%.*]], i64 [[OFFSET:%.*]])
+// CPP-CHECK-NEXT: ret <vscale x 4 x float> [[TMP1]]
+//
+svfloat32_t test_svld1q_gather_u64base_offset_f32(svbool_t pg, svuint64_t base, int64_t offset)
+{
+ return SVE_ACLE_FUNC(svld1q_gather,_u64base,_offset_f32,)(pg, base, offset);
+}
+
+// CHECK-LABEL: @test_svld1q_gather_u64base_offset_f16(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 8 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv8i1(<vscale x 16 x i1> [[PG:%.*]])
+// CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 8 x half> @llvm.aarch64.sve.ld1q.gather.scalar.offset.nxv8f16.nxv2i64(<vscale x 8 x i1> [[TMP0]], <vscale x 2 x i64> [[BASE:%.*]], i64 [[OFFSET:%.*]])
+// CHECK-NEXT: ret <vscale x 8 x half> [[TMP1]]
+//
+// CPP-CHECK-LABEL: @_Z37test_svld1q_gather_u64base_offset_f16u10__SVBool_tu12__SVUint64_tl(
+// CPP-CHECK-NEXT: entry:
+// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 8 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv8i1(<vscale x 16 x i1> [[PG:%.*]])
+// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 8 x half> @llvm.aarch64.sve.ld1q.gather.scalar.offset.nxv8f16.nxv2i64(<vscale x 8 x i1> [[TMP0]], <vscale x 2 x i64> [[BASE:%.*]], i64 [[OFFSET:%.*]])
+// CPP-CHECK-NEXT: ret <vscale x 8 x half> [[TMP1]]
+//
+svfloat16_t test_svld1q_gather_u64base_offset_f16(svbool_t pg, svuint64_t base, int64_t offset)
+{
+ return SVE_ACLE_FUNC(svld1q_gather,_u64base,_offset_f16,)(pg, base, offset);
+}
+
+// CHECK-LABEL: @test_svld1q_gather_u64base_offset_bf16(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 8 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv8i1(<vscale x 16 x i1> [[PG:%.*]])
+// CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 8 x bfloat> @llvm.aarch64.sve.ld1q.gather.scalar.offset.nxv8bf16.nxv2i64(<vscale x 8 x i1> [[TMP0]], <vscale x 2 x i64> [[BASE:%.*]], i64 [[OFFSET:%.*]])
+// CHECK-NEXT: ret <vscale x 8 x bfloat> [[TMP1]]
+//
+// CPP-CHECK-LABEL: @_Z38test_svld1q_gather_u64base_offset_bf16u10__SVBool_tu12__SVUint64_tl(
+// CPP-CHECK-NEXT: entry:
+// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 8 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv8i1(<vscale x 16 x i1> [[PG:%.*]])
+// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 8 x bfloat> @llvm.aarch64.sve.ld1q.gather.scalar.offset.nxv8bf16.nxv2i64(<vscale x 8 x i1> [[TMP0]], <vscale x 2 x i64> [[BASE:%.*]], i64 [[OFFSET:%.*]])
+// CPP-CHECK-NEXT: ret <vscale x 8 x bfloat> [[TMP1]]
+//
+svbfloat16_t test_svld1q_gather_u64base_offset_bf16(svbool_t pg, svuint64_t base, int64_t offset)
+{
+ return SVE_ACLE_FUNC(svld1q_gather,_u64base,_offset_bf16,)(pg, base, offset);
+}
+
+// Vector base and no offset
+// CHECK-LABEL: @test_svld1q_gather_u64base_u64(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 2 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv2i1(<vscale x 16 x i1> [[PG:%.*]])
+// CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 2 x i64> @llvm.aarch64.sve.ld1q.gather.scalar.offset.nxv2i64.nxv2i64(<vscale x 2 x i1> [[TMP0]], <vscale x 2 x i64> [[BASE:%.*]], i64 0)
+// CHECK-NEXT: ret <vscale x 2 x i64> [[TMP1]]
+//
+// CPP-CHECK-LABEL: @_Z30test_svld1q_gather_u64base_u64u10__SVBool_tu12__SVUint64_t(
+// CPP-CHECK-NEXT: entry:
+// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 2 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv2i1(<vscale x 16 x i1> [[PG:%.*]])
+// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 2 x i64> @llvm.aarch64.sve.ld1q.gather.scalar.offset.nxv2i64.nxv2i64(<vscale x 2 x i1> [[TMP0]], <vscale x 2 x i64> [[BASE:%.*]], i64 0)
+// CPP-CHECK-NEXT: ret <vscale x 2 x i64> [[TMP1]]
+//
+svuint64_t test_svld1q_gather_u64base_u64(svbool_t pg, svuint64_t base)
+{
+ return SVE_ACLE_FUNC(svld1q_gather,_u64base,_u64,)(pg, base);
+}
+
+// CHECK-LABEL: @test_svld1q_gather_u64base_s64(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 2 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv2i1(<vscale x 16 x i1> [[PG:%.*]])
+// CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 2 x i64> @llvm.aarch64.sve.ld1q.gather.scalar.offset.nxv2i64.nxv2i64(<vscale x 2 x i1> [[TMP0]], <vscale x 2 x i64> [[BASE:%.*]], i64 0)
+// CHECK-NEXT: ret <vscale x 2 x i64> [[TMP1]]
+//
+// CPP-CHECK-LABEL: @_Z30test_svld1q_gather_u64base_s64u10__SVBool_tu12__SVUint64_t(
+// CPP-CHECK-NEXT: entry:
+// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 2 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv2i1(<vscale x 16 x i1> [[PG:%.*]])
+// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 2 x i64> @llvm.aarch64.sve.ld1q.gather.scalar.offset.nxv2i64.nxv2i64(<vscale x 2 x i1> [[TMP0]], <vscale x 2 x i64> [[BASE:%.*]], i64 0)
+// CPP-CHECK-NEXT: ret <vscale x 2 x i64> [[TMP1]]
+//
+svint64_t test_svld1q_gather_u64base_s64(svbool_t pg, svuint64_t base)
+{
+ return SVE_ACLE_FUNC(svld1q_gather,_u64base,_s64,)(pg, base);
+}
+
+// CHECK-LABEL: @test_svld1q_gather_u64base_u32(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 4 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv4i1(<vscale x 16 x i1> [[PG:%.*]])
+// CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 4 x i32> @llvm.aarch64.sve.ld1q.gather.scalar.offset.nxv4i32.nxv2i64(<vscale x 4 x i1> [[TMP0]], <vscale x 2 x i64> [[BASE:%.*]], i64 0)
+// CHECK-NEXT: ret <vscale x 4 x i32> [[TMP1]]
+//
+// CPP-CHECK-LABEL: @_Z30test_svld1q_gather_u64base_u32u10__SVBool_tu12__SVUint64_t(
+// CPP-CHECK-NEXT: entry:
+// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 4 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv4i1(<vscale x 16 x i1> [[PG:%.*]])
+// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 4 x i32> @llvm.aarch64.sve.ld1q.gather.scalar.offset.nxv4i32.nxv2i64(<vscale x 4 x i1> [[TMP0]], <vscale x 2 x i64> [[BASE:%.*]], i64 0)
+// CPP-CHECK-NEXT: ret <vscale x 4 x i32> [[TMP1]]
+//
+svuint32_t test_svld1q_gather_u64base_u32(svbool_t pg, svuint64_t base)
+{
+ return SVE_ACLE_FUNC(svld1q_gather,_u64base,_u32,)(pg, base);
+}
+
+// CHECK-LABEL: @test_svld1q_gather_u64base_s32(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 4 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv4i1(<vscale x 16 x i1> [[PG:%.*]])
+// CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 4 x i32> @llvm.aarch64.sve.ld1q.gather.scalar.offset.nxv4i32.nxv2i64(<vscale x 4 x i1> [[TMP0]], <vscale x 2 x i64> [[BASE:%.*]], i64 0)
+// CHECK-NEXT: ret <vscale x 4 x i32> [[TMP1]]
+//
+// CPP-CHECK-LABEL: @_Z30test_svld1q_gather_u64base_s32u10__SVBool_tu12__SVUint64_t(
+// CPP-CHECK-NEXT: entry:
+// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 4 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv4i1(<vscale x 16 x i1> [[PG:%.*]])
+// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 4 x i32> @llvm.aarch64.sve.ld1q.gather.scalar.offset.nxv4i32.nxv2i64(<vscale x 4 x i1> [[TMP0]], <vscale x 2 x i64> [[BASE:%.*]], i64 0)
+// CPP-CHECK-NEXT: ret <vscale x 4 x i32> [[TMP1]]
+//
+svint32_t test_svld1q_gather_u64base_s32(svbool_t pg, svuint64_t base)
+{
+ return SVE_ACLE_FUNC(svld1q_gather,_u64base,_s32,)(pg, base);
+}
+
+// CHECK-LABEL: @test_svld1q_gather_u64base_u16(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 8 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv8i1(<vscale x 16 x i1> [[PG:%.*]])
+// CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 8 x i16> @llvm.aarch64.sve.ld1q.gather.scalar.offset.nxv8i16.nxv2i64(<vscale x 8 x i1> [[TMP0]], <vscale x 2 x i64> [[BASE:%.*]], i64 0)
+// CHECK-NEXT: ret <vscale x 8 x i16> [[TMP1]]
+//
+// CPP-CHECK-LABEL: @_Z30test_svld1q_gather_u64base_u16u10__SVBool_tu12__SVUint64_t(
+// CPP-CHECK-NEXT: entry:
+// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 8 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv8i1(<vscale x 16 x i1> [[PG:%.*]])
+// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 8 x i16> @llvm.aarch64.sve.ld1q.gather.scalar.offset.nxv8i16.nxv2i64(<vscale x 8 x i1> [[TMP0]], <vscale x 2 x i64> [[BASE:%.*]], i64 0)
+// CPP-CHECK-NEXT: ret <vscale x 8 x i16> [[TMP1]]
+//
+svuint16_t test_svld1q_gather_u64base_u16(svbool_t pg, svuint64_t base)
+{
+ return SVE_ACLE_FUNC(svld1q_gather,_u64base,_u16,)(pg, base);
+}
+
+// CHECK-LABEL: @test_svld1q_gather_u64base_s16(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 8 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv8i1(<vscale x 16 x i1> [[PG:%.*]])
+// CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 8 x i16> @llvm.aarch64.sve.ld1q.gather.scalar.offset.nxv8i16.nxv2i64(<vscale x 8 x i1> [[TMP0]], <vscale x 2 x i64> [[BASE:%.*]], i64 0)
+// CHECK-NEXT: ret <vscale x 8 x i16> [[TMP1]]
+//
+// CPP-CHECK-LABEL: @_Z30test_svld1q_gather_u64base_s16u10__SVBool_tu12__SVUint64_t(
+// CPP-CHECK-NEXT: entry:
+// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 8 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv8i1(<vscale x 16 x i1> [[PG:%.*]])
+// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 8 x i16> @llvm.aarch64.sve.ld1q.gather.scalar.offset.nxv8i16.nxv2i64(<vscale x 8 x i1> [[TMP0]], <vscale x 2 x i64> [[BASE:%.*]], i64 0)
+// CPP-CHECK-NEXT: ret <vscale x 8 x i16> [[TMP1]]
+//
+svint16_t test_svld1q_gather_u64base_s16(svbool_t pg, svuint64_t base)
+{
+ return SVE_ACLE_FUNC(svld1q_gather,_u64base,_s16,)(pg, base);
+}
+
+// CHECK-LABEL: @test_svld1q_gather_u64base_u8(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 16 x i8> @llvm.aarch64.sve.ld1q.gather.scalar.offset.nxv16i8.nxv2i64(<vscale x 16 x i1> [[PG:%.*]], <vscale x 2 x i64> [[BASE:%.*]], i64 0)
+// CHECK-NEXT: ret <vscale x 16 x i8> [[TMP0]]
+//
+// CPP-CHECK-LABEL: @_Z29test_svld1q_gather_u64base_u8u10__SVBool_tu12__SVUint64_t(
+// CPP-CHECK-NEXT: entry:
+// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 16 x i8> @llvm.aarch64.sve.ld1q.gather.scalar.offset.nxv16i8.nxv2i64(<vscale x 16 x i1> [[PG:%.*]], <vscale x 2 x i64> [[BASE:%.*]], i64 0)
+// CPP-CHECK-NEXT: ret <vscale x 16 x i8> [[TMP0]]
+//
+svuint8_t test_svld1q_gather_u64base_u8(svbool_t pg, svuint64_t base)
+{
+ return SVE_ACLE_FUNC(svld1q_gather,_u64base,_u8,)(pg, base);
+}
+
+// CHECK-LABEL: @test_svld1q_gather_u64base_s8(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 16 x i8> @llvm.aarch64.sve.ld1q.gather.scalar.offset.nxv16i8.nxv2i64(<vscale x 16 x i1> [[PG:%.*]], <vscale x 2 x i64> [[BASE:%.*]], i64 0)
+// CHECK-NEXT: ret <vscale x 16 x i8> [[TMP0]]
+//
+// CPP-CHECK-LABEL: @_Z29test_svld1q_gather_u64base_s8u10__SVBool_tu12__SVUint64_t(
+// CPP-CHECK-NEXT: entry:
+// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 16 x i8> @llvm.aarch64.sve.ld1q.gather.scalar.offset.nxv16i8.nxv2i64(<vscale x 16 x i1> [[PG:%.*]], <vscale x 2 x i64> [[BASE:%.*]], i64 0)
+// CPP-CHECK-NEXT: ret <vscale x 16 x i8> [[TMP0]]
+//
+svint8_t test_svld1q_gather_u64base_s8(svbool_t pg, svuint64_t base)
+{
+ return SVE_ACLE_FUNC(svld1q_gather,_u64base,_s8,)(pg, base);
+}
+
+// CHECK-LABEL: @test_svld1q_gather_u64base_f64(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 2 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv2i1(<vscale x 16 x i1> [[PG:%.*]])
+// CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 2 x double> @llvm.aarch64.sve.ld1q.gather.scalar.offset.nxv2f64.nxv2i64(<vscale x 2 x i1> [[TMP0]], <vscale x 2 x i64> [[BASE:%.*]], i64 0)
+// CHECK-NEXT: ret <vscale x 2 x double> [[TMP1]]
+//
+// CPP-CHECK-LABEL: @_Z30test_svld1q_gather_u64base_f64u10__SVBool_tu12__SVUint64_t(
+// CPP-CHECK-NEXT: entry:
+// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 2 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv2i1(<vscale x 16 x i1> [[PG:%.*]])
+// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 2 x double> @llvm.aarch64.sve.ld1q.gather.scalar.offset.nxv2f64.nxv2i64(<vscale x 2 x i1> [[TMP0]], <vscale x 2 x i64> [[BASE:%.*]], i64 0)
+// CPP-CHECK-NEXT: ret <vscale x 2 x double> [[TMP1]]
+//
+svfloat64_t test_svld1q_gather_u64base_f64(svbool_t pg, svuint64_t base)
+{
+ return SVE_ACLE_FUNC(svld1q_gather,_u64base,_f64,)(pg, base);
+}
+
+// CHECK-LABEL: @test_svld1q_gather_u64base_f32(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 4 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv4i1(<vscale x 16 x i1> [[PG:%.*]])
+// CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 4 x float> @llvm.aarch64.sve.ld1q.gather.scalar.offset.nxv4f32.nxv2i64(<vscale x 4 x i1> [[TMP0]], <vscale x 2 x i64> [[BASE:%.*]], i64 0)
+// CHECK-NEXT: ret <vscale x 4 x float> [[TMP1]]
+//
+// CPP-CHECK-LABEL: @_Z30test_svld1q_gather_u64base_f32u10__SVBool_tu12__SVUint64_t(
+// CPP-CHECK-NEXT: entry:
+// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 4 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv4i1(<vscale x 16 x i1> [[PG:%.*]])
+// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 4 x float> @llvm.aarch64.sve.ld1q.gather.scalar.offset.nxv4f32.nxv2i64(<vscale x 4 x i1> [[TMP0]], <vscale x 2 x i64> [[BASE:%.*]], i64 0)
+// CPP-CHECK-NEXT: ret <vscale x 4 x float> [[TMP1]]
+//
+svfloat32_t test_svld1q_gather_u64base_f32(svbool_t pg, svuint64_t base)
+{
+ return SVE_ACLE_FUNC(svld1q_gather,_u64base,_f32,)(pg, base);
+}
+
+// CHECK-LABEL: @test_svld1q_gather_u64base_f16(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 8 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv8i1(<vscale x 16 x i1> [[PG:%.*]])
+// CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 8 x half> @llvm.aarch64.sve.ld1q.gather.scalar.offset.nxv8f16.nxv2i64(<vscale x 8 x i1> [[TMP0]], <vscale x 2 x i64> [[BASE:%.*]], i64 0)
+// CHECK-NEXT: ret <vscale x 8 x half> [[TMP1]]
+//
+// CPP-CHECK-LABEL: @_Z30test_svld1q_gather_u64base_f16u10__SVBool_tu12__SVUint64_t(
+// CPP-CHECK-NEXT: entry:
+// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 8 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv8i1(<vscale x 16 x i1> [[PG:%.*]])
+// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 8 x half> @llvm.aarch64.sve.ld1q.gather.scalar.offset.nxv8f16.nxv2i64(<vscale x 8 x i1> [[TMP0]], <vscale x 2 x i64> [[BASE:%.*]], i64 0)
+// CPP-CHECK-NEXT: ret <vscale x 8 x half> [[TMP1]]
+//
+svfloat16_t test_svld1q_gather_u64base_f16(svbool_t pg, svuint64_t base)
+{
+ return SVE_ACLE_FUNC(svld1q_gather,_u64base,_f16,)(pg, base);
+}
+
+// CHECK-LABEL: @test_svld1q_gather_u64base_bf16(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 8 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv8i1(<vscale x 16 x i1> [[PG:%.*]])
+// CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 8 x bfloat> @llvm.aarch64.sve.ld1q.gather.scalar.offset.nxv8bf16.nxv2i64(<vscale x 8 x i1> [[TMP0]], <vscale x 2 x i64> [[BASE:%.*]], i64 0)
+// CHECK-NEXT: ret <vscale x 8 x bfloat> [[TMP1]]
+//
+// CPP-CHECK-LABEL: @_Z31test_svld1q_gather_u64base_bf16u10__SVBool_tu12__SVUint64_t(
+// CPP-CHECK-NEXT: entry:
+// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 8 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv8i1(<vscale x 16 x i1> [[PG:%.*]])
+// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 8 x bfloat> @llvm.aarch64.sve.ld1q.gather.scalar.offset.nxv8bf16.nxv2i64(<vscale x 8 x i1> [[TMP0]], <vscale x 2 x i64> [[BASE:%.*]], i64 0)
+// CPP-CHECK-NEXT: ret <vscale x 8 x bfloat> [[TMP1]]
+//
+svbfloat16_t test_svld1q_gather_u64base_bf16(svbool_t pg, svuint64_t base)
+{
+ return SVE_ACLE_FUNC(svld1q_gather,_u64base,_bf16,)(pg, base);
+}
diff --git a/clang/test/CodeGen/aarch64-sve2p1-intrinsics/acle_sve2p1_st1_single.c b/clang/test/CodeGen/aarch64-sve2p1-intrinsics/acle_sve2p1_st1_single.c
new file mode 100644
index 000000000000000..52c16faec7f32b2
--- /dev/null
+++ b/clang/test/CodeGen/aarch64-sve2p1-intrinsics/acle_sve2p1_st1_single.c
@@ -0,0 +1,255 @@
+// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 2
+// REQUIRES: aarch64-registered-target
+// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sve2p1 \
+// RUN: -S -Werror -emit-llvm -disable-O0-optnone -o - %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s
+// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sve2p1 \
+// RUN: -S -Werror -emit-llvm -disable-O0-optnone -o - %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s
+// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sve2p1 \
+// RUN: -S -Werror -emit-llvm -disable-O0-optnone -o - -x c++ %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK
+// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sve2p1 \
+// RUN: -S -Werror -emit-llvm -disable-O0-optnone -o - -x c++ %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK
+// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sve2p1 \
+// RUN: -S -disable-O0-optnone -Werror -Wall -o /dev/null %s
+
+
+#include <arm_sve.h>
+
+#ifdef SVE_OVERLOADED_FORMS
+// A simple used,unused... macro, long enough to represent any SVE builtin.
+#define SVE_ACLE_FUNC(A1, A2_UNUSED, A3, A4_UNUSED) A1##A3
+#else
+#define SVE_ACLE_FUNC(A1, A2, A3, A4) A1##A2##A3##A4
+#endif
+
+// ST1W
+
+// CHECK-LABEL: define dso_local void @test_svst1uwq_u32
+// CHECK-SAME: (<vscale x 16 x i1> [[PRED:%.*]], ptr noundef [[BASE:%.*]], <vscale x 4 x i32> [[ZT:%.*]]) #[[ATTR0:[0-9]+]] {
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 1 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv1i1(<vscale x 16 x i1> [[PRED]])
+// CHECK-NEXT: tail call void @llvm.aarch64.sve.st1uwq.nxv4i32(<vscale x 4 x i32> [[ZT]], <vscale x 1 x i1> [[TMP0]], ptr [[BASE]])
+// CHECK-NEXT: ret void
+//
+// CPP-CHECK-LABEL: define dso_local void @_Z17test_svst1uwq_u32u10__SVBool_tPKju12__SVUint32_t
+// CPP-CHECK-SAME: (<vscale x 16 x i1> [[PRED:%.*]], ptr noundef [[BASE:%.*]], <vscale x 4 x i32> [[ZT:%.*]]) #[[ATTR0:[0-9]+]] {
+// CPP-CHECK-NEXT: entry:
+// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 1 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv1i1(<vscale x 16 x i1> [[PRED]])
+// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sve.st1uwq.nxv4i32(<vscale x 4 x i32> [[ZT]], <vscale x 1 x i1> [[TMP0]], ptr [[BASE]])
+// CPP-CHECK-NEXT: ret void
+//
+void test_svst1uwq_u32(svbool_t pred, uint32_t const * base, svuint32_t zt) {
+ SVE_ACLE_FUNC(svst1uwq, _u32, , )(pred, base, zt);
+}
+
+// CHECK-LABEL: define dso_local void @test_svst1uwq_vnum_u32
+// CHECK-SAME: (<vscale x 16 x i1> [[PRED:%.*]], ptr noundef [[BASE:%.*]], <vscale x 4 x i32> [[ZT:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 1 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv1i1(<vscale x 16 x i1> [[PRED]])
+// CHECK-NEXT: [[TMP1:%.*]] = getelementptr <vscale x 1 x i32>, ptr [[BASE]], i64 1
+// CHECK-NEXT: tail call void @llvm.aarch64.sve.st1uwq.nxv4i32(<vscale x 4 x i32> [[ZT]], <vscale x 1 x i1> [[TMP0]], ptr [[TMP1]])
+// CHECK-NEXT: ret void
+//
+// CPP-CHECK-LABEL: define dso_local void @_Z22test_svst1uwq_vnum_u32u10__SVBool_tPKju12__SVUint32_t
+// CPP-CHECK-SAME: (<vscale x 16 x i1> [[PRED:%.*]], ptr noundef [[BASE:%.*]], <vscale x 4 x i32> [[ZT:%.*]]) #[[ATTR0]] {
+// CPP-CHECK-NEXT: entry:
+// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 1 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv1i1(<vscale x 16 x i1> [[PRED]])
+// CPP-CHECK-NEXT: [[TMP1:%.*]] = getelementptr <vscale x 1 x i32>, ptr [[BASE]], i64 1
+// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sve.st1uwq.nxv4i32(<vscale x 4 x i32> [[ZT]], <vscale x 1 x i1> [[TMP0]], ptr [[TMP1]])
+// CPP-CHECK-NEXT: ret void
+//
+void test_svst1uwq_vnum_u32(svbool_t pred, uint32_t const * base, svuint32_t zt) {
+ SVE_ACLE_FUNC(svst1uwq_vnum, _u32, , )(pred, base, 1, zt);
+}
+
+// CHECK-LABEL: define dso_local void @test_svst1uwq_s32
+// CHECK-SAME: (<vscale x 16 x i1> [[PRED:%.*]], ptr noundef [[BASE:%.*]], <vscale x 4 x i32> [[ZT:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 1 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv1i1(<vscale x 16 x i1> [[PRED]])
+// CHECK-NEXT: tail call void @llvm.aarch64.sve.st1uwq.nxv4i32(<vscale x 4 x i32> [[ZT]], <vscale x 1 x i1> [[TMP0]], ptr [[BASE]])
+// CHECK-NEXT: ret void
+//
+// CPP-CHECK-LABEL: define dso_local void @_Z17test_svst1uwq_s32u10__SVBool_tPKiu11__SVInt32_t
+// CPP-CHECK-SAME: (<vscale x 16 x i1> [[PRED:%.*]], ptr noundef [[BASE:%.*]], <vscale x 4 x i32> [[ZT:%.*]]) #[[ATTR0]] {
+// CPP-CHECK-NEXT: entry:
+// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 1 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv1i1(<vscale x 16 x i1> [[PRED]])
+// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sve.st1uwq.nxv4i32(<vscale x 4 x i32> [[ZT]], <vscale x 1 x i1> [[TMP0]], ptr [[BASE]])
+// CPP-CHECK-NEXT: ret void
+//
+void test_svst1uwq_s32(svbool_t pred, int32_t const * base, svint32_t zt) {
+ SVE_ACLE_FUNC(svst1uwq, _s32, , )(pred, base, zt);
+}
+
+// CHECK-LABEL: define dso_local void @test_svst1uwq_vnum_s32
+// CHECK-SAME: (<vscale x 16 x i1> [[PRED:%.*]], ptr noundef [[BASE:%.*]], <vscale x 4 x i32> [[ZT:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 1 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv1i1(<vscale x 16 x i1> [[PRED]])
+// CHECK-NEXT: [[TMP1:%.*]] = getelementptr <vscale x 1 x i32>, ptr [[BASE]], i64 1
+// CHECK-NEXT: tail call void @llvm.aarch64.sve.st1uwq.nxv4i32(<vscale x 4 x i32> [[ZT]], <vscale x 1 x i1> [[TMP0]], ptr [[TMP1]])
+// CHECK-NEXT: ret void
+//
+// CPP-CHECK-LABEL: define dso_local void @_Z22test_svst1uwq_vnum_s32u10__SVBool_tPKiu11__SVInt32_t
+// CPP-CHECK-SAME: (<vscale x 16 x i1> [[PRED:%.*]], ptr noundef [[BASE:%.*]], <vscale x 4 x i32> [[ZT:%.*]]) #[[ATTR0]] {
+// CPP-CHECK-NEXT: entry:
+// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 1 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv1i1(<vscale x 16 x i1> [[PRED]])
+// CPP-CHECK-NEXT: [[TMP1:%.*]] = getelementptr <vscale x 1 x i32>, ptr [[BASE]], i64 1
+// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sve.st1uwq.nxv4i32(<vscale x 4 x i32> [[ZT]], <vscale x 1 x i1> [[TMP0]], ptr [[TMP1]])
+// CPP-CHECK-NEXT: ret void
+//
+void test_svst1uwq_vnum_s32(svbool_t pred, int32_t const * base, svint32_t zt) {
+ SVE_ACLE_FUNC(svst1uwq_vnum, _s32, , )(pred, base, 1, zt);
+}
+
+// CHECK-LABEL: define dso_local void @test_svst1uwq_f32
+// CHECK-SAME: (<vscale x 16 x i1> [[PRED:%.*]], ptr noundef [[BASE:%.*]], <vscale x 4 x float> [[ZT:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 1 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv1i1(<vscale x 16 x i1> [[PRED]])
+// CHECK-NEXT: tail call void @llvm.aarch64.sve.st1uwq.nxv4f32(<vscale x 4 x float> [[ZT]], <vscale x 1 x i1> [[TMP0]], ptr [[BASE]])
+// CHECK-NEXT: ret void
+//
+// CPP-CHECK-LABEL: define dso_local void @_Z17test_svst1uwq_f32u10__SVBool_tPKfu13__SVFloat32_t
+// CPP-CHECK-SAME: (<vscale x 16 x i1> [[PRED:%.*]], ptr noundef [[BASE:%.*]], <vscale x 4 x float> [[ZT:%.*]]) #[[ATTR0]] {
+// CPP-CHECK-NEXT: entry:
+// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 1 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv1i1(<vscale x 16 x i1> [[PRED]])
+// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sve.st1uwq.nxv4f32(<vscale x 4 x float> [[ZT]], <vscale x 1 x i1> [[TMP0]], ptr [[BASE]])
+// CPP-CHECK-NEXT: ret void
+//
+void test_svst1uwq_f32(svbool_t pred, float32_t const * base, svfloat32_t zt) {
+ SVE_ACLE_FUNC(svst1uwq, _f32, , )(pred, base, zt);
+}
+
+// CHECK-LABEL: define dso_local void @test_svst1uwq_vnum_f32
+// CHECK-SAME: (<vscale x 16 x i1> [[PRED:%.*]], ptr noundef [[BASE:%.*]], <vscale x 4 x float> [[ZT:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 1 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv1i1(<vscale x 16 x i1> [[PRED]])
+// CHECK-NEXT: [[TMP1:%.*]] = getelementptr <vscale x 1 x float>, ptr [[BASE]], i64 1
+// CHECK-NEXT: tail call void @llvm.aarch64.sve.st1uwq.nxv4f32(<vscale x 4 x float> [[ZT]], <vscale x 1 x i1> [[TMP0]], ptr [[TMP1]])
+// CHECK-NEXT: ret void
+//
+// CPP-CHECK-LABEL: define dso_local void @_Z22test_svst1uwq_vnum_f32u10__SVBool_tPKfu13__SVFloat32_t
+// CPP-CHECK-SAME: (<vscale x 16 x i1> [[PRED:%.*]], ptr noundef [[BASE:%.*]], <vscale x 4 x float> [[ZT:%.*]]) #[[ATTR0]] {
+// CPP-CHECK-NEXT: entry:
+// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 1 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv1i1(<vscale x 16 x i1> [[PRED]])
+// CPP-CHECK-NEXT: [[TMP1:%.*]] = getelementptr <vscale x 1 x float>, ptr [[BASE]], i64 1
+// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sve.st1uwq.nxv4f32(<vscale x 4 x float> [[ZT]], <vscale x 1 x i1> [[TMP0]], ptr [[TMP1]])
+// CPP-CHECK-NEXT: ret void
+//
+void test_svst1uwq_vnum_f32(svbool_t pred, float32_t const * base, svfloat32_t zt) {
+ SVE_ACLE_FUNC(svst1uwq_vnum, _f32, , )(pred, base, 1, zt);
+}
+
+
+// ST1D
+
+// CHECK-LABEL: define dso_local void @test_svst1udq_u64
+// CHECK-SAME: (<vscale x 16 x i1> [[PRED:%.*]], ptr noundef [[BASE:%.*]], <vscale x 2 x i64> [[ZT:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 1 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv1i1(<vscale x 16 x i1> [[PRED]])
+// CHECK-NEXT: tail call void @llvm.aarch64.sve.st1udq.nxv2i64(<vscale x 2 x i64> [[ZT]], <vscale x 1 x i1> [[TMP0]], ptr [[BASE]])
+// CHECK-NEXT: ret void
+//
+// CPP-CHECK-LABEL: define dso_local void @_Z17test_svst1udq_u64u10__SVBool_tPKmu12__SVUint64_t
+// CPP-CHECK-SAME: (<vscale x 16 x i1> [[PRED:%.*]], ptr noundef [[BASE:%.*]], <vscale x 2 x i64> [[ZT:%.*]]) #[[ATTR0]] {
+// CPP-CHECK-NEXT: entry:
+// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 1 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv1i1(<vscale x 16 x i1> [[PRED]])
+// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sve.st1udq.nxv2i64(<vscale x 2 x i64> [[ZT]], <vscale x 1 x i1> [[TMP0]], ptr [[BASE]])
+// CPP-CHECK-NEXT: ret void
+//
+void test_svst1udq_u64(svbool_t pred, uint64_t const * base, svuint64_t zt) {
+ SVE_ACLE_FUNC(svst1udq, _u64, , )(pred, base, zt);
+}
+
+// CHECK-LABEL: define dso_local void @test_svst1udq_vnum_u64
+// CHECK-SAME: (<vscale x 16 x i1> [[PRED:%.*]], ptr noundef [[BASE:%.*]], <vscale x 2 x i64> [[ZT:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 1 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv1i1(<vscale x 16 x i1> [[PRED]])
+// CHECK-NEXT: [[TMP1:%.*]] = getelementptr <vscale x 1 x i64>, ptr [[BASE]], i64 -8
+// CHECK-NEXT: tail call void @llvm.aarch64.sve.st1udq.nxv2i64(<vscale x 2 x i64> [[ZT]], <vscale x 1 x i1> [[TMP0]], ptr [[TMP1]])
+// CHECK-NEXT: ret void
+//
+// CPP-CHECK-LABEL: define dso_local void @_Z22test_svst1udq_vnum_u64u10__SVBool_tPKmu12__SVUint64_t
+// CPP-CHECK-SAME: (<vscale x 16 x i1> [[PRED:%.*]], ptr noundef [[BASE:%.*]], <vscale x 2 x i64> [[ZT:%.*]]) #[[ATTR0]] {
+// CPP-CHECK-NEXT: entry:
+// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 1 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv1i1(<vscale x 16 x i1> [[PRED]])
+// CPP-CHECK-NEXT: [[TMP1:%.*]] = getelementptr <vscale x 1 x i64>, ptr [[BASE]], i64 -8
+// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sve.st1udq.nxv2i64(<vscale x 2 x i64> [[ZT]], <vscale x 1 x i1> [[TMP0]], ptr [[TMP1]])
+// CPP-CHECK-NEXT: ret void
+//
+void test_svst1udq_vnum_u64(svbool_t pred, uint64_t const * base, svuint64_t zt) {
+ SVE_ACLE_FUNC(svst1udq_vnum, _u64, , )(pred, base, -8, zt);
+}
+
+// CHECK-LABEL: define dso_local void @test_svst1udq_s64
+// CHECK-SAME: (<vscale x 16 x i1> [[PRED:%.*]], ptr noundef [[BASE:%.*]], <vscale x 2 x i64> [[ZT:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 1 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv1i1(<vscale x 16 x i1> [[PRED]])
+// CHECK-NEXT: tail call void @llvm.aarch64.sve.st1udq.nxv2i64(<vscale x 2 x i64> [[ZT]], <vscale x 1 x i1> [[TMP0]], ptr [[BASE]])
+// CHECK-NEXT: ret void
+//
+// CPP-CHECK-LABEL: define dso_local void @_Z17test_svst1udq_s64u10__SVBool_tPKlu11__SVInt64_t
+// CPP-CHECK-SAME: (<vscale x 16 x i1> [[PRED:%.*]], ptr noundef [[BASE:%.*]], <vscale x 2 x i64> [[ZT:%.*]]) #[[ATTR0]] {
+// CPP-CHECK-NEXT: entry:
+// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 1 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv1i1(<vscale x 16 x i1> [[PRED]])
+// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sve.st1udq.nxv2i64(<vscale x 2 x i64> [[ZT]], <vscale x 1 x i1> [[TMP0]], ptr [[BASE]])
+// CPP-CHECK-NEXT: ret void
+//
+void test_svst1udq_s64(svbool_t pred, int64_t const * base, svint64_t zt) {
+ SVE_ACLE_FUNC(svst1udq, _s64, , )(pred, base, zt);
+}
+
+// CHECK-LABEL: define dso_local void @test_svst1udq_vnum_s64
+// CHECK-SAME: (<vscale x 16 x i1> [[PRED:%.*]], ptr noundef [[BASE:%.*]], <vscale x 2 x i64> [[ZT:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 1 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv1i1(<vscale x 16 x i1> [[PRED]])
+// CHECK-NEXT: [[TMP1:%.*]] = getelementptr <vscale x 1 x i64>, ptr [[BASE]], i64 -8
+// CHECK-NEXT: tail call void @llvm.aarch64.sve.st1udq.nxv2i64(<vscale x 2 x i64> [[ZT]], <vscale x 1 x i1> [[TMP0]], ptr [[TMP1]])
+// CHECK-NEXT: ret void
+//
+// CPP-CHECK-LABEL: define dso_local void @_Z22test_svst1udq_vnum_s64u10__SVBool_tPKlu11__SVInt64_t
+// CPP-CHECK-SAME: (<vscale x 16 x i1> [[PRED:%.*]], ptr noundef [[BASE:%.*]], <vscale x 2 x i64> [[ZT:%.*]]) #[[ATTR0]] {
+// CPP-CHECK-NEXT: entry:
+// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 1 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv1i1(<vscale x 16 x i1> [[PRED]])
+// CPP-CHECK-NEXT: [[TMP1:%.*]] = getelementptr <vscale x 1 x i64>, ptr [[BASE]], i64 -8
+// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sve.st1udq.nxv2i64(<vscale x 2 x i64> [[ZT]], <vscale x 1 x i1> [[TMP0]], ptr [[TMP1]])
+// CPP-CHECK-NEXT: ret void
+//
+void test_svst1udq_vnum_s64(svbool_t pred, int64_t const * base, svint64_t zt) {
+ SVE_ACLE_FUNC(svst1udq_vnum, _s64, , )(pred, base, -8, zt);
+}
+
+// CHECK-LABEL: define dso_local void @test_svst1udq_f64
+// CHECK-SAME: (<vscale x 16 x i1> [[PRED:%.*]], ptr noundef [[BASE:%.*]], <vscale x 2 x double> [[ZT:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 1 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv1i1(<vscale x 16 x i1> [[PRED]])
+// CHECK-NEXT: tail call void @llvm.aarch64.sve.st1udq.nxv2f64(<vscale x 2 x double> [[ZT]], <vscale x 1 x i1> [[TMP0]], ptr [[BASE]])
+// CHECK-NEXT: ret void
+//
+// CPP-CHECK-LABEL: define dso_local void @_Z17test_svst1udq_f64u10__SVBool_tPKdu13__SVFloat64_t
+// CPP-CHECK-SAME: (<vscale x 16 x i1> [[PRED:%.*]], ptr noundef [[BASE:%.*]], <vscale x 2 x double> [[ZT:%.*]]) #[[ATTR0]] {
+// CPP-CHECK-NEXT: entry:
+// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 1 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv1i1(<vscale x 16 x i1> [[PRED]])
+// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sve.st1udq.nxv2f64(<vscale x 2 x double> [[ZT]], <vscale x 1 x i1> [[TMP0]], ptr [[BASE]])
+// CPP-CHECK-NEXT: ret void
+//
+void test_svst1udq_f64(svbool_t pred, float64_t const * base, svfloat64_t zt) {
+ SVE_ACLE_FUNC(svst1udq, _f64, , )(pred, base, zt);
+}
+
+// CHECK-LABEL: define dso_local void @test_svst1udq_vnum_f64
+// CHECK-SAME: (<vscale x 16 x i1> [[PRED:%.*]], ptr noundef [[BASE:%.*]], <vscale x 2 x double> [[ZT:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 1 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv1i1(<vscale x 16 x i1> [[PRED]])
+// CHECK-NEXT: [[TMP1:%.*]] = getelementptr <vscale x 1 x double>, ptr [[BASE]], i64 -8
+// CHECK-NEXT: tail call void @llvm.aarch64.sve.st1udq.nxv2f64(<vscale x 2 x double> [[ZT]], <vscale x 1 x i1> [[TMP0]], ptr [[TMP1]])
+// CHECK-NEXT: ret void
+//
+// CPP-CHECK-LABEL: define dso_local void @_Z22test_svst1udq_vnum_f64u10__SVBool_tPKdu13__SVFloat64_t
+// CPP-CHECK-SAME: (<vscale x 16 x i1> [[PRED:%.*]], ptr noundef [[BASE:%.*]], <vscale x 2 x double> [[ZT:%.*]]) #[[ATTR0]] {
+// CPP-CHECK-NEXT: entry:
+// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 1 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv1i1(<vscale x 16 x i1> [[PRED]])
+// CPP-CHECK-NEXT: [[TMP1:%.*]] = getelementptr <vscale x 1 x double>, ptr [[BASE]], i64 -8
+// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sve.st1udq.nxv2f64(<vscale x 2 x double> [[ZT]], <vscale x 1 x i1> [[TMP0]], ptr [[TMP1]])
+// CPP-CHECK-NEXT: ret void
+//
+void test_svst1udq_vnum_f64(svbool_t pred, float64_t const * base, svfloat64_t zt) {
+ SVE_ACLE_FUNC(svst1udq_vnum, _f64, , )(pred, base, -8, zt);
+}
diff --git a/clang/test/CodeGen/aarch64-sve2p1-intrinsics/acle_sve2p1_store.c b/clang/test/CodeGen/aarch64-sve2p1-intrinsics/acle_sve2p1_store.c
new file mode 100644
index 000000000000000..5584fb16379ed9c
--- /dev/null
+++ b/clang/test/CodeGen/aarch64-sve2p1-intrinsics/acle_sve2p1_store.c
@@ -0,0 +1,2124 @@
+// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py
+// REQUIRES: aarch64-registered-target
+// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sve2p1 -target-feature +bf16 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s
+// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sve2p1 -target-feature +bf16 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK
+// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sve2p1 -target-feature +bf16 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s
+// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sve2p1 -target-feature +bf16 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK
+// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sve2p1 -target-feature +bf16 -S -disable-O0-optnone -Werror -Wall -o /dev/null %s
+#include <arm_sve.h>
+
+#ifdef SVE_OVERLOADED_FORMS
+// A simple used,unused... macro, long enough to represent any SVE builtin.
+#define SVE_ACLE_FUNC(A1,A2_UNUSED,A3,A4_UNUSED) A1##A3
+#else
+#define SVE_ACLE_FUNC(A1,A2,A3,A4) A1##A2##A3##A4
+#endif
+
+// CHECK-LABEL: @test_svst2q_u8(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv32i8(<vscale x 32 x i8> [[ZT:%.*]], i64 0)
+// CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv32i8(<vscale x 32 x i8> [[ZT]], i64 16)
+// CHECK-NEXT: tail call void @llvm.aarch64.sve.st2q.nxv16i8(<vscale x 16 x i8> [[TMP0]], <vscale x 16 x i8> [[TMP1]], <vscale x 16 x i1> [[PG:%.*]], ptr [[BASE:%.*]])
+// CHECK-NEXT: ret void
+//
+// CPP-CHECK-LABEL: @_Z14test_svst2q_u8u10__SVBool_tPKh11svuint8x2_t(
+// CPP-CHECK-NEXT: entry:
+// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv32i8(<vscale x 32 x i8> [[ZT:%.*]], i64 0)
+// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv32i8(<vscale x 32 x i8> [[ZT]], i64 16)
+// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sve.st2q.nxv16i8(<vscale x 16 x i8> [[TMP0]], <vscale x 16 x i8> [[TMP1]], <vscale x 16 x i1> [[PG:%.*]], ptr [[BASE:%.*]])
+// CPP-CHECK-NEXT: ret void
+//
+void test_svst2q_u8(svbool_t pg, const uint8_t *base, svuint8x2_t zt)
+{
+ SVE_ACLE_FUNC(svst2q,,_u8,)(pg, base, zt);
+}
+
+// CHECK-LABEL: @test_svst2q_s8(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv32i8(<vscale x 32 x i8> [[ZT:%.*]], i64 0)
+// CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv32i8(<vscale x 32 x i8> [[ZT]], i64 16)
+// CHECK-NEXT: tail call void @llvm.aarch64.sve.st2q.nxv16i8(<vscale x 16 x i8> [[TMP0]], <vscale x 16 x i8> [[TMP1]], <vscale x 16 x i1> [[PG:%.*]], ptr [[BASE:%.*]])
+// CHECK-NEXT: ret void
+//
+// CPP-CHECK-LABEL: @_Z14test_svst2q_s8u10__SVBool_tPKa10svint8x2_t(
+// CPP-CHECK-NEXT: entry:
+// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv32i8(<vscale x 32 x i8> [[ZT:%.*]], i64 0)
+// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv32i8(<vscale x 32 x i8> [[ZT]], i64 16)
+// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sve.st2q.nxv16i8(<vscale x 16 x i8> [[TMP0]], <vscale x 16 x i8> [[TMP1]], <vscale x 16 x i1> [[PG:%.*]], ptr [[BASE:%.*]])
+// CPP-CHECK-NEXT: ret void
+//
+void test_svst2q_s8(svbool_t pg, const int8_t *base, svint8x2_t zt)
+{
+ SVE_ACLE_FUNC(svst2q,,_s8,)(pg, base, zt);
+}
+// CHECK-LABEL: @test_svst2q_u16(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 8 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv8i1(<vscale x 16 x i1> [[PG:%.*]])
+// CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv16i16(<vscale x 16 x i16> [[ZT:%.*]], i64 0)
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv16i16(<vscale x 16 x i16> [[ZT]], i64 8)
+// CHECK-NEXT: tail call void @llvm.aarch64.sve.st2q.nxv8i16(<vscale x 8 x i16> [[TMP1]], <vscale x 8 x i16> [[TMP2]], <vscale x 8 x i1> [[TMP0]], ptr [[BASE:%.*]])
+// CHECK-NEXT: ret void
+//
+// CPP-CHECK-LABEL: @_Z15test_svst2q_u16u10__SVBool_tPKt12svuint16x2_t(
+// CPP-CHECK-NEXT: entry:
+// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 8 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv8i1(<vscale x 16 x i1> [[PG:%.*]])
+// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv16i16(<vscale x 16 x i16> [[ZT:%.*]], i64 0)
+// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv16i16(<vscale x 16 x i16> [[ZT]], i64 8)
+// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sve.st2q.nxv8i16(<vscale x 8 x i16> [[TMP1]], <vscale x 8 x i16> [[TMP2]], <vscale x 8 x i1> [[TMP0]], ptr [[BASE:%.*]])
+// CPP-CHECK-NEXT: ret void
+//
+void test_svst2q_u16(svbool_t pg, const uint16_t *base, svuint16x2_t zt)
+{
+ SVE_ACLE_FUNC(svst2q,,_u16,)(pg, base, zt);
+}
+
+// CHECK-LABEL: @test_svst2q_s16(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 8 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv8i1(<vscale x 16 x i1> [[PG:%.*]])
+// CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv16i16(<vscale x 16 x i16> [[ZT:%.*]], i64 0)
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv16i16(<vscale x 16 x i16> [[ZT]], i64 8)
+// CHECK-NEXT: tail call void @llvm.aarch64.sve.st2q.nxv8i16(<vscale x 8 x i16> [[TMP1]], <vscale x 8 x i16> [[TMP2]], <vscale x 8 x i1> [[TMP0]], ptr [[BASE:%.*]])
+// CHECK-NEXT: ret void
+//
+// CPP-CHECK-LABEL: @_Z15test_svst2q_s16u10__SVBool_tPKs11svint16x2_t(
+// CPP-CHECK-NEXT: entry:
+// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 8 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv8i1(<vscale x 16 x i1> [[PG:%.*]])
+// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv16i16(<vscale x 16 x i16> [[ZT:%.*]], i64 0)
+// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv16i16(<vscale x 16 x i16> [[ZT]], i64 8)
+// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sve.st2q.nxv8i16(<vscale x 8 x i16> [[TMP1]], <vscale x 8 x i16> [[TMP2]], <vscale x 8 x i1> [[TMP0]], ptr [[BASE:%.*]])
+// CPP-CHECK-NEXT: ret void
+//
+void test_svst2q_s16(svbool_t pg, const int16_t *base, svint16x2_t zt)
+{
+ SVE_ACLE_FUNC(svst2q,,_s16,)(pg, base, zt);
+}
+
+// CHECK-LABEL: @test_svst2q_u32(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 4 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv4i1(<vscale x 16 x i1> [[PG:%.*]])
+// CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 4 x i32> @llvm.vector.extract.nxv4i32.nxv8i32(<vscale x 8 x i32> [[ZT:%.*]], i64 0)
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <vscale x 4 x i32> @llvm.vector.extract.nxv4i32.nxv8i32(<vscale x 8 x i32> [[ZT]], i64 4)
+// CHECK-NEXT: tail call void @llvm.aarch64.sve.st2q.nxv4i32(<vscale x 4 x i32> [[TMP1]], <vscale x 4 x i32> [[TMP2]], <vscale x 4 x i1> [[TMP0]], ptr [[BASE:%.*]])
+// CHECK-NEXT: ret void
+//
+// CPP-CHECK-LABEL: @_Z15test_svst2q_u32u10__SVBool_tPKj12svuint32x2_t(
+// CPP-CHECK-NEXT: entry:
+// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 4 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv4i1(<vscale x 16 x i1> [[PG:%.*]])
+// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 4 x i32> @llvm.vector.extract.nxv4i32.nxv8i32(<vscale x 8 x i32> [[ZT:%.*]], i64 0)
+// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call <vscale x 4 x i32> @llvm.vector.extract.nxv4i32.nxv8i32(<vscale x 8 x i32> [[ZT]], i64 4)
+// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sve.st2q.nxv4i32(<vscale x 4 x i32> [[TMP1]], <vscale x 4 x i32> [[TMP2]], <vscale x 4 x i1> [[TMP0]], ptr [[BASE:%.*]])
+// CPP-CHECK-NEXT: ret void
+//
+void test_svst2q_u32(svbool_t pg, const uint32_t *base, svuint32x2_t zt)
+{
+ SVE_ACLE_FUNC(svst2q,,_u32,)(pg, base, zt);
+}
+
+// CHECK-LABEL: @test_svst2q_s32(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 4 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv4i1(<vscale x 16 x i1> [[PG:%.*]])
+// CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 4 x i32> @llvm.vector.extract.nxv4i32.nxv8i32(<vscale x 8 x i32> [[ZT:%.*]], i64 0)
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <vscale x 4 x i32> @llvm.vector.extract.nxv4i32.nxv8i32(<vscale x 8 x i32> [[ZT]], i64 4)
+// CHECK-NEXT: tail call void @llvm.aarch64.sve.st2q.nxv4i32(<vscale x 4 x i32> [[TMP1]], <vscale x 4 x i32> [[TMP2]], <vscale x 4 x i1> [[TMP0]], ptr [[BASE:%.*]])
+// CHECK-NEXT: ret void
+//
+// CPP-CHECK-LABEL: @_Z15test_svst2q_s32u10__SVBool_tPKi11svint32x2_t(
+// CPP-CHECK-NEXT: entry:
+// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 4 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv4i1(<vscale x 16 x i1> [[PG:%.*]])
+// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 4 x i32> @llvm.vector.extract.nxv4i32.nxv8i32(<vscale x 8 x i32> [[ZT:%.*]], i64 0)
+// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call <vscale x 4 x i32> @llvm.vector.extract.nxv4i32.nxv8i32(<vscale x 8 x i32> [[ZT]], i64 4)
+// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sve.st2q.nxv4i32(<vscale x 4 x i32> [[TMP1]], <vscale x 4 x i32> [[TMP2]], <vscale x 4 x i1> [[TMP0]], ptr [[BASE:%.*]])
+// CPP-CHECK-NEXT: ret void
+//
+void test_svst2q_s32(svbool_t pg, const int32_t *base, svint32x2_t zt)
+{
+ SVE_ACLE_FUNC(svst2q,,_s32,)(pg, base, zt);
+}
+
+// CHECK-LABEL: @test_svst2q_u64(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 2 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv2i1(<vscale x 16 x i1> [[PG:%.*]])
+// CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 2 x i64> @llvm.vector.extract.nxv2i64.nxv4i64(<vscale x 4 x i64> [[ZT:%.*]], i64 0)
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <vscale x 2 x i64> @llvm.vector.extract.nxv2i64.nxv4i64(<vscale x 4 x i64> [[ZT]], i64 2)
+// CHECK-NEXT: tail call void @llvm.aarch64.sve.st2q.nxv2i64(<vscale x 2 x i64> [[TMP1]], <vscale x 2 x i64> [[TMP2]], <vscale x 2 x i1> [[TMP0]], ptr [[BASE:%.*]])
+// CHECK-NEXT: ret void
+//
+// CPP-CHECK-LABEL: @_Z15test_svst2q_u64u10__SVBool_tPKm12svuint64x2_t(
+// CPP-CHECK-NEXT: entry:
+// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 2 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv2i1(<vscale x 16 x i1> [[PG:%.*]])
+// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 2 x i64> @llvm.vector.extract.nxv2i64.nxv4i64(<vscale x 4 x i64> [[ZT:%.*]], i64 0)
+// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call <vscale x 2 x i64> @llvm.vector.extract.nxv2i64.nxv4i64(<vscale x 4 x i64> [[ZT]], i64 2)
+// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sve.st2q.nxv2i64(<vscale x 2 x i64> [[TMP1]], <vscale x 2 x i64> [[TMP2]], <vscale x 2 x i1> [[TMP0]], ptr [[BASE:%.*]])
+// CPP-CHECK-NEXT: ret void
+//
+void test_svst2q_u64(svbool_t pg, const uint64_t *base, svuint64x2_t zt)
+{
+ SVE_ACLE_FUNC(svst2q,,_u64,)(pg, base, zt);
+}
+
+// CHECK-LABEL: @test_svst2q_s64(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 2 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv2i1(<vscale x 16 x i1> [[PG:%.*]])
+// CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 2 x i64> @llvm.vector.extract.nxv2i64.nxv4i64(<vscale x 4 x i64> [[ZT:%.*]], i64 0)
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <vscale x 2 x i64> @llvm.vector.extract.nxv2i64.nxv4i64(<vscale x 4 x i64> [[ZT]], i64 2)
+// CHECK-NEXT: tail call void @llvm.aarch64.sve.st2q.nxv2i64(<vscale x 2 x i64> [[TMP1]], <vscale x 2 x i64> [[TMP2]], <vscale x 2 x i1> [[TMP0]], ptr [[BASE:%.*]])
+// CHECK-NEXT: ret void
+//
+// CPP-CHECK-LABEL: @_Z15test_svst2q_s64u10__SVBool_tPKl11svint64x2_t(
+// CPP-CHECK-NEXT: entry:
+// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 2 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv2i1(<vscale x 16 x i1> [[PG:%.*]])
+// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 2 x i64> @llvm.vector.extract.nxv2i64.nxv4i64(<vscale x 4 x i64> [[ZT:%.*]], i64 0)
+// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call <vscale x 2 x i64> @llvm.vector.extract.nxv2i64.nxv4i64(<vscale x 4 x i64> [[ZT]], i64 2)
+// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sve.st2q.nxv2i64(<vscale x 2 x i64> [[TMP1]], <vscale x 2 x i64> [[TMP2]], <vscale x 2 x i1> [[TMP0]], ptr [[BASE:%.*]])
+// CPP-CHECK-NEXT: ret void
+//
+void test_svst2q_s64(svbool_t pg, const int64_t *base, svint64x2_t zt)
+{
+ SVE_ACLE_FUNC(svst2q,,_s64,)(pg, base, zt);
+}
+
+// CHECK-LABEL: @test_svst2q_f16(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 8 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv8i1(<vscale x 16 x i1> [[PG:%.*]])
+// CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 8 x half> @llvm.vector.extract.nxv8f16.nxv16f16(<vscale x 16 x half> [[ZT:%.*]], i64 0)
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <vscale x 8 x half> @llvm.vector.extract.nxv8f16.nxv16f16(<vscale x 16 x half> [[ZT]], i64 8)
+// CHECK-NEXT: tail call void @llvm.aarch64.sve.st2q.nxv8f16(<vscale x 8 x half> [[TMP1]], <vscale x 8 x half> [[TMP2]], <vscale x 8 x i1> [[TMP0]], ptr [[BASE:%.*]])
+// CHECK-NEXT: ret void
+//
+// CPP-CHECK-LABEL: @_Z15test_svst2q_f16u10__SVBool_tPKDh13svfloat16x2_t(
+// CPP-CHECK-NEXT: entry:
+// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 8 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv8i1(<vscale x 16 x i1> [[PG:%.*]])
+// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 8 x half> @llvm.vector.extract.nxv8f16.nxv16f16(<vscale x 16 x half> [[ZT:%.*]], i64 0)
+// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call <vscale x 8 x half> @llvm.vector.extract.nxv8f16.nxv16f16(<vscale x 16 x half> [[ZT]], i64 8)
+// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sve.st2q.nxv8f16(<vscale x 8 x half> [[TMP1]], <vscale x 8 x half> [[TMP2]], <vscale x 8 x i1> [[TMP0]], ptr [[BASE:%.*]])
+// CPP-CHECK-NEXT: ret void
+//
+void test_svst2q_f16(svbool_t pg, const float16_t *base, svfloat16x2_t zt)
+{
+ SVE_ACLE_FUNC(svst2q,,_f16,)(pg, base, zt);
+}
+
+// CHECK-LABEL: @test_svst2q_bf16(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 8 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv8i1(<vscale x 16 x i1> [[PG:%.*]])
+// CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 8 x bfloat> @llvm.vector.extract.nxv8bf16.nxv16bf16(<vscale x 16 x bfloat> [[ZT:%.*]], i64 0)
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <vscale x 8 x bfloat> @llvm.vector.extract.nxv8bf16.nxv16bf16(<vscale x 16 x bfloat> [[ZT]], i64 8)
+// CHECK-NEXT: tail call void @llvm.aarch64.sve.st2q.nxv8bf16(<vscale x 8 x bfloat> [[TMP1]], <vscale x 8 x bfloat> [[TMP2]], <vscale x 8 x i1> [[TMP0]], ptr [[BASE:%.*]])
+// CHECK-NEXT: ret void
+//
+// CPP-CHECK-LABEL: @_Z16test_svst2q_bf16u10__SVBool_tPKu6__bf1614svbfloat16x2_t(
+// CPP-CHECK-NEXT: entry:
+// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 8 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv8i1(<vscale x 16 x i1> [[PG:%.*]])
+// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 8 x bfloat> @llvm.vector.extract.nxv8bf16.nxv16bf16(<vscale x 16 x bfloat> [[ZT:%.*]], i64 0)
+// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call <vscale x 8 x bfloat> @llvm.vector.extract.nxv8bf16.nxv16bf16(<vscale x 16 x bfloat> [[ZT]], i64 8)
+// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sve.st2q.nxv8bf16(<vscale x 8 x bfloat> [[TMP1]], <vscale x 8 x bfloat> [[TMP2]], <vscale x 8 x i1> [[TMP0]], ptr [[BASE:%.*]])
+// CPP-CHECK-NEXT: ret void
+//
+void test_svst2q_bf16(svbool_t pg, const bfloat16_t *base, svbfloat16x2_t zt)
+{
+ SVE_ACLE_FUNC(svst2q,,_bf16,)(pg, base, zt);
+}
+
+// CHECK-LABEL: @test_svst2q_f32(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 4 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv4i1(<vscale x 16 x i1> [[PG:%.*]])
+// CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 4 x float> @llvm.vector.extract.nxv4f32.nxv8f32(<vscale x 8 x float> [[ZT:%.*]], i64 0)
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <vscale x 4 x float> @llvm.vector.extract.nxv4f32.nxv8f32(<vscale x 8 x float> [[ZT]], i64 4)
+// CHECK-NEXT: tail call void @llvm.aarch64.sve.st2q.nxv4f32(<vscale x 4 x float> [[TMP1]], <vscale x 4 x float> [[TMP2]], <vscale x 4 x i1> [[TMP0]], ptr [[BASE:%.*]])
+// CHECK-NEXT: ret void
+//
+// CPP-CHECK-LABEL: @_Z15test_svst2q_f32u10__SVBool_tPKf13svfloat32x2_t(
+// CPP-CHECK-NEXT: entry:
+// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 4 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv4i1(<vscale x 16 x i1> [[PG:%.*]])
+// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 4 x float> @llvm.vector.extract.nxv4f32.nxv8f32(<vscale x 8 x float> [[ZT:%.*]], i64 0)
+// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call <vscale x 4 x float> @llvm.vector.extract.nxv4f32.nxv8f32(<vscale x 8 x float> [[ZT]], i64 4)
+// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sve.st2q.nxv4f32(<vscale x 4 x float> [[TMP1]], <vscale x 4 x float> [[TMP2]], <vscale x 4 x i1> [[TMP0]], ptr [[BASE:%.*]])
+// CPP-CHECK-NEXT: ret void
+//
+void test_svst2q_f32(svbool_t pg, const float32_t *base, svfloat32x2_t zt)
+{
+ SVE_ACLE_FUNC(svst2q,,_f32,)(pg, base, zt);
+}
+
+// CHECK-LABEL: @test_svst2q_f64(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 2 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv2i1(<vscale x 16 x i1> [[PG:%.*]])
+// CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 2 x double> @llvm.vector.extract.nxv2f64.nxv4f64(<vscale x 4 x double> [[ZT:%.*]], i64 0)
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <vscale x 2 x double> @llvm.vector.extract.nxv2f64.nxv4f64(<vscale x 4 x double> [[ZT]], i64 2)
+// CHECK-NEXT: tail call void @llvm.aarch64.sve.st2q.nxv2f64(<vscale x 2 x double> [[TMP1]], <vscale x 2 x double> [[TMP2]], <vscale x 2 x i1> [[TMP0]], ptr [[BASE:%.*]])
+// CHECK-NEXT: ret void
+//
+// CPP-CHECK-LABEL: @_Z15test_svst2q_f64u10__SVBool_tPKd13svfloat64x2_t(
+// CPP-CHECK-NEXT: entry:
+// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 2 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv2i1(<vscale x 16 x i1> [[PG:%.*]])
+// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 2 x double> @llvm.vector.extract.nxv2f64.nxv4f64(<vscale x 4 x double> [[ZT:%.*]], i64 0)
+// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call <vscale x 2 x double> @llvm.vector.extract.nxv2f64.nxv4f64(<vscale x 4 x double> [[ZT]], i64 2)
+// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sve.st2q.nxv2f64(<vscale x 2 x double> [[TMP1]], <vscale x 2 x double> [[TMP2]], <vscale x 2 x i1> [[TMP0]], ptr [[BASE:%.*]])
+// CPP-CHECK-NEXT: ret void
+//
+void test_svst2q_f64(svbool_t pg, const float64_t *base, svfloat64x2_t zt)
+{
+ SVE_ACLE_FUNC(svst2q,,_f64,)(pg, base, zt);
+}
+
+// CHECK-LABEL: @test_svst2q_vnum_u8(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[TMP0:%.*]] = getelementptr <vscale x 16 x i8>, ptr [[BASE:%.*]], i64 [[VNUM:%.*]]
+// CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv32i8(<vscale x 32 x i8> [[ZT:%.*]], i64 0)
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv32i8(<vscale x 32 x i8> [[ZT]], i64 16)
+// CHECK-NEXT: tail call void @llvm.aarch64.sve.st2q.nxv16i8(<vscale x 16 x i8> [[TMP1]], <vscale x 16 x i8> [[TMP2]], <vscale x 16 x i1> [[PG:%.*]], ptr [[TMP0]])
+// CHECK-NEXT: ret void
+//
+// CPP-CHECK-LABEL: @_Z19test_svst2q_vnum_u8u10__SVBool_tPKhl11svuint8x2_t(
+// CPP-CHECK-NEXT: entry:
+// CPP-CHECK-NEXT: [[TMP0:%.*]] = getelementptr <vscale x 16 x i8>, ptr [[BASE:%.*]], i64 [[VNUM:%.*]]
+// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv32i8(<vscale x 32 x i8> [[ZT:%.*]], i64 0)
+// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv32i8(<vscale x 32 x i8> [[ZT]], i64 16)
+// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sve.st2q.nxv16i8(<vscale x 16 x i8> [[TMP1]], <vscale x 16 x i8> [[TMP2]], <vscale x 16 x i1> [[PG:%.*]], ptr [[TMP0]])
+// CPP-CHECK-NEXT: ret void
+//
+void test_svst2q_vnum_u8(svbool_t pg, const uint8_t *base, int64_t vnum, svuint8x2_t zt)
+{
+ SVE_ACLE_FUNC(svst2q_vnum_,,u8,)(pg, base, vnum, zt);
+}
+
+// CHECK-LABEL: @test_svst2q_vnum_s8(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[TMP0:%.*]] = getelementptr <vscale x 16 x i8>, ptr [[BASE:%.*]], i64 [[VNUM:%.*]]
+// CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv32i8(<vscale x 32 x i8> [[ZT:%.*]], i64 0)
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv32i8(<vscale x 32 x i8> [[ZT]], i64 16)
+// CHECK-NEXT: tail call void @llvm.aarch64.sve.st2q.nxv16i8(<vscale x 16 x i8> [[TMP1]], <vscale x 16 x i8> [[TMP2]], <vscale x 16 x i1> [[PG:%.*]], ptr [[TMP0]])
+// CHECK-NEXT: ret void
+//
+// CPP-CHECK-LABEL: @_Z19test_svst2q_vnum_s8u10__SVBool_tPKal10svint8x2_t(
+// CPP-CHECK-NEXT: entry:
+// CPP-CHECK-NEXT: [[TMP0:%.*]] = getelementptr <vscale x 16 x i8>, ptr [[BASE:%.*]], i64 [[VNUM:%.*]]
+// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv32i8(<vscale x 32 x i8> [[ZT:%.*]], i64 0)
+// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv32i8(<vscale x 32 x i8> [[ZT]], i64 16)
+// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sve.st2q.nxv16i8(<vscale x 16 x i8> [[TMP1]], <vscale x 16 x i8> [[TMP2]], <vscale x 16 x i1> [[PG:%.*]], ptr [[TMP0]])
+// CPP-CHECK-NEXT: ret void
+//
+void test_svst2q_vnum_s8(svbool_t pg, const int8_t *base, int64_t vnum, svint8x2_t zt)
+{
+ SVE_ACLE_FUNC(svst2q_vnum,,_s8,)(pg, base, vnum, zt);
+}
+
+// CHECK-LABEL: @test_svst2q_vnum_u16(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 8 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv8i1(<vscale x 16 x i1> [[PG:%.*]])
+// CHECK-NEXT: [[TMP1:%.*]] = getelementptr <vscale x 8 x i16>, ptr [[BASE:%.*]], i64 [[VNUM:%.*]]
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv16i16(<vscale x 16 x i16> [[ZT:%.*]], i64 0)
+// CHECK-NEXT: [[TMP3:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv16i16(<vscale x 16 x i16> [[ZT]], i64 8)
+// CHECK-NEXT: tail call void @llvm.aarch64.sve.st2q.nxv8i16(<vscale x 8 x i16> [[TMP2]], <vscale x 8 x i16> [[TMP3]], <vscale x 8 x i1> [[TMP0]], ptr [[TMP1]])
+// CHECK-NEXT: ret void
+//
+// CPP-CHECK-LABEL: @_Z20test_svst2q_vnum_u16u10__SVBool_tPKtl12svuint16x2_t(
+// CPP-CHECK-NEXT: entry:
+// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 8 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv8i1(<vscale x 16 x i1> [[PG:%.*]])
+// CPP-CHECK-NEXT: [[TMP1:%.*]] = getelementptr <vscale x 8 x i16>, ptr [[BASE:%.*]], i64 [[VNUM:%.*]]
+// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv16i16(<vscale x 16 x i16> [[ZT:%.*]], i64 0)
+// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv16i16(<vscale x 16 x i16> [[ZT]], i64 8)
+// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sve.st2q.nxv8i16(<vscale x 8 x i16> [[TMP2]], <vscale x 8 x i16> [[TMP3]], <vscale x 8 x i1> [[TMP0]], ptr [[TMP1]])
+// CPP-CHECK-NEXT: ret void
+//
+void test_svst2q_vnum_u16(svbool_t pg, const uint16_t *base, int64_t vnum, svuint16x2_t zt)
+{
+ SVE_ACLE_FUNC(svst2q_vnum,,_u16,)(pg, base, vnum, zt);
+}
+
+// CHECK-LABEL: @test_svst2q_vnum_s16(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 8 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv8i1(<vscale x 16 x i1> [[PG:%.*]])
+// CHECK-NEXT: [[TMP1:%.*]] = getelementptr <vscale x 8 x i16>, ptr [[BASE:%.*]], i64 [[VNUM:%.*]]
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv16i16(<vscale x 16 x i16> [[ZT:%.*]], i64 0)
+// CHECK-NEXT: [[TMP3:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv16i16(<vscale x 16 x i16> [[ZT]], i64 8)
+// CHECK-NEXT: tail call void @llvm.aarch64.sve.st2q.nxv8i16(<vscale x 8 x i16> [[TMP2]], <vscale x 8 x i16> [[TMP3]], <vscale x 8 x i1> [[TMP0]], ptr [[TMP1]])
+// CHECK-NEXT: ret void
+//
+// CPP-CHECK-LABEL: @_Z20test_svst2q_vnum_s16u10__SVBool_tPKsl11svint16x2_t(
+// CPP-CHECK-NEXT: entry:
+// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 8 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv8i1(<vscale x 16 x i1> [[PG:%.*]])
+// CPP-CHECK-NEXT: [[TMP1:%.*]] = getelementptr <vscale x 8 x i16>, ptr [[BASE:%.*]], i64 [[VNUM:%.*]]
+// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv16i16(<vscale x 16 x i16> [[ZT:%.*]], i64 0)
+// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv16i16(<vscale x 16 x i16> [[ZT]], i64 8)
+// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sve.st2q.nxv8i16(<vscale x 8 x i16> [[TMP2]], <vscale x 8 x i16> [[TMP3]], <vscale x 8 x i1> [[TMP0]], ptr [[TMP1]])
+// CPP-CHECK-NEXT: ret void
+//
+void test_svst2q_vnum_s16(svbool_t pg, const int16_t *base, int64_t vnum, svint16x2_t zt)
+{
+ SVE_ACLE_FUNC(svst2q_vnum,,_s16,)(pg, base, vnum, zt);
+}
+
+// CHECK-LABEL: @test_svst2q_vnum_u32(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 4 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv4i1(<vscale x 16 x i1> [[PG:%.*]])
+// CHECK-NEXT: [[TMP1:%.*]] = getelementptr <vscale x 4 x i32>, ptr [[BASE:%.*]], i64 [[VNUM:%.*]]
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <vscale x 4 x i32> @llvm.vector.extract.nxv4i32.nxv8i32(<vscale x 8 x i32> [[ZT:%.*]], i64 0)
+// CHECK-NEXT: [[TMP3:%.*]] = tail call <vscale x 4 x i32> @llvm.vector.extract.nxv4i32.nxv8i32(<vscale x 8 x i32> [[ZT]], i64 4)
+// CHECK-NEXT: tail call void @llvm.aarch64.sve.st2q.nxv4i32(<vscale x 4 x i32> [[TMP2]], <vscale x 4 x i32> [[TMP3]], <vscale x 4 x i1> [[TMP0]], ptr [[TMP1]])
+// CHECK-NEXT: ret void
+//
+// CPP-CHECK-LABEL: @_Z20test_svst2q_vnum_u32u10__SVBool_tPKjl12svuint32x2_t(
+// CPP-CHECK-NEXT: entry:
+// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 4 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv4i1(<vscale x 16 x i1> [[PG:%.*]])
+// CPP-CHECK-NEXT: [[TMP1:%.*]] = getelementptr <vscale x 4 x i32>, ptr [[BASE:%.*]], i64 [[VNUM:%.*]]
+// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call <vscale x 4 x i32> @llvm.vector.extract.nxv4i32.nxv8i32(<vscale x 8 x i32> [[ZT:%.*]], i64 0)
+// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call <vscale x 4 x i32> @llvm.vector.extract.nxv4i32.nxv8i32(<vscale x 8 x i32> [[ZT]], i64 4)
+// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sve.st2q.nxv4i32(<vscale x 4 x i32> [[TMP2]], <vscale x 4 x i32> [[TMP3]], <vscale x 4 x i1> [[TMP0]], ptr [[TMP1]])
+// CPP-CHECK-NEXT: ret void
+//
+void test_svst2q_vnum_u32(svbool_t pg, const uint32_t *base, int64_t vnum, svuint32x2_t zt)
+{
+ SVE_ACLE_FUNC(svst2q_vnum,,_u32,)(pg, base, vnum, zt);
+}
+
+// CHECK-LABEL: @test_svst2q_vnum_s32(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 4 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv4i1(<vscale x 16 x i1> [[PG:%.*]])
+// CHECK-NEXT: [[TMP1:%.*]] = getelementptr <vscale x 4 x i32>, ptr [[BASE:%.*]], i64 [[VNUM:%.*]]
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <vscale x 4 x i32> @llvm.vector.extract.nxv4i32.nxv8i32(<vscale x 8 x i32> [[ZT:%.*]], i64 0)
+// CHECK-NEXT: [[TMP3:%.*]] = tail call <vscale x 4 x i32> @llvm.vector.extract.nxv4i32.nxv8i32(<vscale x 8 x i32> [[ZT]], i64 4)
+// CHECK-NEXT: tail call void @llvm.aarch64.sve.st2q.nxv4i32(<vscale x 4 x i32> [[TMP2]], <vscale x 4 x i32> [[TMP3]], <vscale x 4 x i1> [[TMP0]], ptr [[TMP1]])
+// CHECK-NEXT: ret void
+//
+// CPP-CHECK-LABEL: @_Z20test_svst2q_vnum_s32u10__SVBool_tPKil11svint32x2_t(
+// CPP-CHECK-NEXT: entry:
+// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 4 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv4i1(<vscale x 16 x i1> [[PG:%.*]])
+// CPP-CHECK-NEXT: [[TMP1:%.*]] = getelementptr <vscale x 4 x i32>, ptr [[BASE:%.*]], i64 [[VNUM:%.*]]
+// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call <vscale x 4 x i32> @llvm.vector.extract.nxv4i32.nxv8i32(<vscale x 8 x i32> [[ZT:%.*]], i64 0)
+// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call <vscale x 4 x i32> @llvm.vector.extract.nxv4i32.nxv8i32(<vscale x 8 x i32> [[ZT]], i64 4)
+// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sve.st2q.nxv4i32(<vscale x 4 x i32> [[TMP2]], <vscale x 4 x i32> [[TMP3]], <vscale x 4 x i1> [[TMP0]], ptr [[TMP1]])
+// CPP-CHECK-NEXT: ret void
+//
+void test_svst2q_vnum_s32(svbool_t pg, const int32_t *base, int64_t vnum, svint32x2_t zt)
+{
+ SVE_ACLE_FUNC(svst2q_vnum,,_s32,)(pg, base, vnum, zt);
+}
+
+// CHECK-LABEL: @test_svst2q_vnum_u64(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 2 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv2i1(<vscale x 16 x i1> [[PG:%.*]])
+// CHECK-NEXT: [[TMP1:%.*]] = getelementptr <vscale x 2 x i64>, ptr [[BASE:%.*]], i64 [[VNUM:%.*]]
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <vscale x 2 x i64> @llvm.vector.extract.nxv2i64.nxv4i64(<vscale x 4 x i64> [[ZT:%.*]], i64 0)
+// CHECK-NEXT: [[TMP3:%.*]] = tail call <vscale x 2 x i64> @llvm.vector.extract.nxv2i64.nxv4i64(<vscale x 4 x i64> [[ZT]], i64 2)
+// CHECK-NEXT: tail call void @llvm.aarch64.sve.st2q.nxv2i64(<vscale x 2 x i64> [[TMP2]], <vscale x 2 x i64> [[TMP3]], <vscale x 2 x i1> [[TMP0]], ptr [[TMP1]])
+// CHECK-NEXT: ret void
+//
+// CPP-CHECK-LABEL: @_Z20test_svst2q_vnum_u64u10__SVBool_tPKml12svuint64x2_t(
+// CPP-CHECK-NEXT: entry:
+// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 2 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv2i1(<vscale x 16 x i1> [[PG:%.*]])
+// CPP-CHECK-NEXT: [[TMP1:%.*]] = getelementptr <vscale x 2 x i64>, ptr [[BASE:%.*]], i64 [[VNUM:%.*]]
+// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call <vscale x 2 x i64> @llvm.vector.extract.nxv2i64.nxv4i64(<vscale x 4 x i64> [[ZT:%.*]], i64 0)
+// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call <vscale x 2 x i64> @llvm.vector.extract.nxv2i64.nxv4i64(<vscale x 4 x i64> [[ZT]], i64 2)
+// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sve.st2q.nxv2i64(<vscale x 2 x i64> [[TMP2]], <vscale x 2 x i64> [[TMP3]], <vscale x 2 x i1> [[TMP0]], ptr [[TMP1]])
+// CPP-CHECK-NEXT: ret void
+//
+void test_svst2q_vnum_u64(svbool_t pg, const uint64_t *base, int64_t vnum, svuint64x2_t zt)
+{
+ SVE_ACLE_FUNC(svst2q_vnum,,_u64,)(pg, base, vnum, zt);
+}
+
+// CHECK-LABEL: @test_svst2q_vnum_s64(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 2 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv2i1(<vscale x 16 x i1> [[PG:%.*]])
+// CHECK-NEXT: [[TMP1:%.*]] = getelementptr <vscale x 2 x i64>, ptr [[BASE:%.*]], i64 [[VNUM:%.*]]
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <vscale x 2 x i64> @llvm.vector.extract.nxv2i64.nxv4i64(<vscale x 4 x i64> [[ZT:%.*]], i64 0)
+// CHECK-NEXT: [[TMP3:%.*]] = tail call <vscale x 2 x i64> @llvm.vector.extract.nxv2i64.nxv4i64(<vscale x 4 x i64> [[ZT]], i64 2)
+// CHECK-NEXT: tail call void @llvm.aarch64.sve.st2q.nxv2i64(<vscale x 2 x i64> [[TMP2]], <vscale x 2 x i64> [[TMP3]], <vscale x 2 x i1> [[TMP0]], ptr [[TMP1]])
+// CHECK-NEXT: ret void
+//
+// CPP-CHECK-LABEL: @_Z20test_svst2q_vnum_s64u10__SVBool_tPKll11svint64x2_t(
+// CPP-CHECK-NEXT: entry:
+// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 2 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv2i1(<vscale x 16 x i1> [[PG:%.*]])
+// CPP-CHECK-NEXT: [[TMP1:%.*]] = getelementptr <vscale x 2 x i64>, ptr [[BASE:%.*]], i64 [[VNUM:%.*]]
+// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call <vscale x 2 x i64> @llvm.vector.extract.nxv2i64.nxv4i64(<vscale x 4 x i64> [[ZT:%.*]], i64 0)
+// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call <vscale x 2 x i64> @llvm.vector.extract.nxv2i64.nxv4i64(<vscale x 4 x i64> [[ZT]], i64 2)
+// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sve.st2q.nxv2i64(<vscale x 2 x i64> [[TMP2]], <vscale x 2 x i64> [[TMP3]], <vscale x 2 x i1> [[TMP0]], ptr [[TMP1]])
+// CPP-CHECK-NEXT: ret void
+//
+void test_svst2q_vnum_s64(svbool_t pg, const int64_t *base, int64_t vnum, svint64x2_t zt)
+{
+ SVE_ACLE_FUNC(svst2q_vnum,,_s64,)(pg, base, vnum, zt);
+}
+
+// CHECK-LABEL: @test_svst2q_vnum_f16(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 8 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv8i1(<vscale x 16 x i1> [[PG:%.*]])
+// CHECK-NEXT: [[TMP1:%.*]] = getelementptr <vscale x 8 x half>, ptr [[BASE:%.*]], i64 [[VNUM:%.*]]
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <vscale x 8 x half> @llvm.vector.extract.nxv8f16.nxv16f16(<vscale x 16 x half> [[ZT:%.*]], i64 0)
+// CHECK-NEXT: [[TMP3:%.*]] = tail call <vscale x 8 x half> @llvm.vector.extract.nxv8f16.nxv16f16(<vscale x 16 x half> [[ZT]], i64 8)
+// CHECK-NEXT: tail call void @llvm.aarch64.sve.st2q.nxv8f16(<vscale x 8 x half> [[TMP2]], <vscale x 8 x half> [[TMP3]], <vscale x 8 x i1> [[TMP0]], ptr [[TMP1]])
+// CHECK-NEXT: ret void
+//
+// CPP-CHECK-LABEL: @_Z20test_svst2q_vnum_f16u10__SVBool_tPKDhl13svfloat16x2_t(
+// CPP-CHECK-NEXT: entry:
+// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 8 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv8i1(<vscale x 16 x i1> [[PG:%.*]])
+// CPP-CHECK-NEXT: [[TMP1:%.*]] = getelementptr <vscale x 8 x half>, ptr [[BASE:%.*]], i64 [[VNUM:%.*]]
+// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call <vscale x 8 x half> @llvm.vector.extract.nxv8f16.nxv16f16(<vscale x 16 x half> [[ZT:%.*]], i64 0)
+// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call <vscale x 8 x half> @llvm.vector.extract.nxv8f16.nxv16f16(<vscale x 16 x half> [[ZT]], i64 8)
+// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sve.st2q.nxv8f16(<vscale x 8 x half> [[TMP2]], <vscale x 8 x half> [[TMP3]], <vscale x 8 x i1> [[TMP0]], ptr [[TMP1]])
+// CPP-CHECK-NEXT: ret void
+//
+void test_svst2q_vnum_f16(svbool_t pg, const float16_t *base, int64_t vnum, svfloat16x2_t zt)
+{
+ SVE_ACLE_FUNC(svst2q_vnum,,_f16,)(pg, base, vnum, zt);
+}
+
+// CHECK-LABEL: @test_svst2q_vnum_bf16(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 8 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv8i1(<vscale x 16 x i1> [[PG:%.*]])
+// CHECK-NEXT: [[TMP1:%.*]] = getelementptr <vscale x 8 x bfloat>, ptr [[BASE:%.*]], i64 [[VNUM:%.*]]
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <vscale x 8 x bfloat> @llvm.vector.extract.nxv8bf16.nxv16bf16(<vscale x 16 x bfloat> [[ZT:%.*]], i64 0)
+// CHECK-NEXT: [[TMP3:%.*]] = tail call <vscale x 8 x bfloat> @llvm.vector.extract.nxv8bf16.nxv16bf16(<vscale x 16 x bfloat> [[ZT]], i64 8)
+// CHECK-NEXT: tail call void @llvm.aarch64.sve.st2q.nxv8bf16(<vscale x 8 x bfloat> [[TMP2]], <vscale x 8 x bfloat> [[TMP3]], <vscale x 8 x i1> [[TMP0]], ptr [[TMP1]])
+// CHECK-NEXT: ret void
+//
+// CPP-CHECK-LABEL: @_Z21test_svst2q_vnum_bf16u10__SVBool_tPKu6__bf16l14svbfloat16x2_t(
+// CPP-CHECK-NEXT: entry:
+// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 8 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv8i1(<vscale x 16 x i1> [[PG:%.*]])
+// CPP-CHECK-NEXT: [[TMP1:%.*]] = getelementptr <vscale x 8 x bfloat>, ptr [[BASE:%.*]], i64 [[VNUM:%.*]]
+// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call <vscale x 8 x bfloat> @llvm.vector.extract.nxv8bf16.nxv16bf16(<vscale x 16 x bfloat> [[ZT:%.*]], i64 0)
+// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call <vscale x 8 x bfloat> @llvm.vector.extract.nxv8bf16.nxv16bf16(<vscale x 16 x bfloat> [[ZT]], i64 8)
+// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sve.st2q.nxv8bf16(<vscale x 8 x bfloat> [[TMP2]], <vscale x 8 x bfloat> [[TMP3]], <vscale x 8 x i1> [[TMP0]], ptr [[TMP1]])
+// CPP-CHECK-NEXT: ret void
+//
+void test_svst2q_vnum_bf16(svbool_t pg, const bfloat16_t *base, int64_t vnum, svbfloat16x2_t zt)
+{
+ SVE_ACLE_FUNC(svst2q_vnum,,_bf16,)(pg, base, vnum, zt);
+}
+
+// CHECK-LABEL: @test_svst2q_vnum_f32(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 4 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv4i1(<vscale x 16 x i1> [[PG:%.*]])
+// CHECK-NEXT: [[TMP1:%.*]] = getelementptr <vscale x 4 x float>, ptr [[BASE:%.*]], i64 [[VNUM:%.*]]
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <vscale x 4 x float> @llvm.vector.extract.nxv4f32.nxv8f32(<vscale x 8 x float> [[ZT:%.*]], i64 0)
+// CHECK-NEXT: [[TMP3:%.*]] = tail call <vscale x 4 x float> @llvm.vector.extract.nxv4f32.nxv8f32(<vscale x 8 x float> [[ZT]], i64 4)
+// CHECK-NEXT: tail call void @llvm.aarch64.sve.st2q.nxv4f32(<vscale x 4 x float> [[TMP2]], <vscale x 4 x float> [[TMP3]], <vscale x 4 x i1> [[TMP0]], ptr [[TMP1]])
+// CHECK-NEXT: ret void
+//
+// CPP-CHECK-LABEL: @_Z20test_svst2q_vnum_f32u10__SVBool_tPKfl13svfloat32x2_t(
+// CPP-CHECK-NEXT: entry:
+// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 4 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv4i1(<vscale x 16 x i1> [[PG:%.*]])
+// CPP-CHECK-NEXT: [[TMP1:%.*]] = getelementptr <vscale x 4 x float>, ptr [[BASE:%.*]], i64 [[VNUM:%.*]]
+// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call <vscale x 4 x float> @llvm.vector.extract.nxv4f32.nxv8f32(<vscale x 8 x float> [[ZT:%.*]], i64 0)
+// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call <vscale x 4 x float> @llvm.vector.extract.nxv4f32.nxv8f32(<vscale x 8 x float> [[ZT]], i64 4)
+// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sve.st2q.nxv4f32(<vscale x 4 x float> [[TMP2]], <vscale x 4 x float> [[TMP3]], <vscale x 4 x i1> [[TMP0]], ptr [[TMP1]])
+// CPP-CHECK-NEXT: ret void
+//
+void test_svst2q_vnum_f32(svbool_t pg, const float32_t *base, int64_t vnum, svfloat32x2_t zt)
+{
+ SVE_ACLE_FUNC(svst2q_vnum,,_f32,)(pg, base, vnum, zt);
+}
+
+// CHECK-LABEL: @test_svst2q_vnum_f64(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 2 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv2i1(<vscale x 16 x i1> [[PG:%.*]])
+// CHECK-NEXT: [[TMP1:%.*]] = getelementptr <vscale x 2 x double>, ptr [[BASE:%.*]], i64 [[VNUM:%.*]]
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <vscale x 2 x double> @llvm.vector.extract.nxv2f64.nxv4f64(<vscale x 4 x double> [[ZT:%.*]], i64 0)
+// CHECK-NEXT: [[TMP3:%.*]] = tail call <vscale x 2 x double> @llvm.vector.extract.nxv2f64.nxv4f64(<vscale x 4 x double> [[ZT]], i64 2)
+// CHECK-NEXT: tail call void @llvm.aarch64.sve.st2q.nxv2f64(<vscale x 2 x double> [[TMP2]], <vscale x 2 x double> [[TMP3]], <vscale x 2 x i1> [[TMP0]], ptr [[TMP1]])
+// CHECK-NEXT: ret void
+//
+// CPP-CHECK-LABEL: @_Z20test_svst2q_vnum_f64u10__SVBool_tPKdl13svfloat64x2_t(
+// CPP-CHECK-NEXT: entry:
+// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 2 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv2i1(<vscale x 16 x i1> [[PG:%.*]])
+// CPP-CHECK-NEXT: [[TMP1:%.*]] = getelementptr <vscale x 2 x double>, ptr [[BASE:%.*]], i64 [[VNUM:%.*]]
+// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call <vscale x 2 x double> @llvm.vector.extract.nxv2f64.nxv4f64(<vscale x 4 x double> [[ZT:%.*]], i64 0)
+// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call <vscale x 2 x double> @llvm.vector.extract.nxv2f64.nxv4f64(<vscale x 4 x double> [[ZT]], i64 2)
+// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sve.st2q.nxv2f64(<vscale x 2 x double> [[TMP2]], <vscale x 2 x double> [[TMP3]], <vscale x 2 x i1> [[TMP0]], ptr [[TMP1]])
+// CPP-CHECK-NEXT: ret void
+//
+void test_svst2q_vnum_f64(svbool_t pg, const float64_t *base, int64_t vnum, svfloat64x2_t zt)
+{
+ SVE_ACLE_FUNC(svst2q_vnum,,_f64,)(pg, base, vnum, zt);
+}
+
+//
+// ST3Q
+// CHECK-LABEL: @test_svst3q_u8(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv48i8(<vscale x 48 x i8> [[ZT:%.*]], i64 0)
+// CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv48i8(<vscale x 48 x i8> [[ZT]], i64 16)
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv48i8(<vscale x 48 x i8> [[ZT]], i64 32)
+// CHECK-NEXT: tail call void @llvm.aarch64.sve.st3q.nxv16i8(<vscale x 16 x i8> [[TMP0]], <vscale x 16 x i8> [[TMP1]], <vscale x 16 x i8> [[TMP2]], <vscale x 16 x i1> [[PG:%.*]], ptr [[BASE:%.*]])
+// CHECK-NEXT: ret void
+//
+// CPP-CHECK-LABEL: @_Z14test_svst3q_u8u10__SVBool_tPKh11svuint8x3_t(
+// CPP-CHECK-NEXT: entry:
+// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv48i8(<vscale x 48 x i8> [[ZT:%.*]], i64 0)
+// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv48i8(<vscale x 48 x i8> [[ZT]], i64 16)
+// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv48i8(<vscale x 48 x i8> [[ZT]], i64 32)
+// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sve.st3q.nxv16i8(<vscale x 16 x i8> [[TMP0]], <vscale x 16 x i8> [[TMP1]], <vscale x 16 x i8> [[TMP2]], <vscale x 16 x i1> [[PG:%.*]], ptr [[BASE:%.*]])
+// CPP-CHECK-NEXT: ret void
+//
+void test_svst3q_u8(svbool_t pg, const uint8_t *base, svuint8x3_t zt)
+{
+ SVE_ACLE_FUNC(svst3q,,_u8,)(pg, base, zt);
+}
+
+// CHECK-LABEL: @test_svst3q_s8(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv48i8(<vscale x 48 x i8> [[ZT:%.*]], i64 0)
+// CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv48i8(<vscale x 48 x i8> [[ZT]], i64 16)
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv48i8(<vscale x 48 x i8> [[ZT]], i64 32)
+// CHECK-NEXT: tail call void @llvm.aarch64.sve.st3q.nxv16i8(<vscale x 16 x i8> [[TMP0]], <vscale x 16 x i8> [[TMP1]], <vscale x 16 x i8> [[TMP2]], <vscale x 16 x i1> [[PG:%.*]], ptr [[BASE:%.*]])
+// CHECK-NEXT: ret void
+//
+// CPP-CHECK-LABEL: @_Z14test_svst3q_s8u10__SVBool_tPKa10svint8x3_t(
+// CPP-CHECK-NEXT: entry:
+// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv48i8(<vscale x 48 x i8> [[ZT:%.*]], i64 0)
+// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv48i8(<vscale x 48 x i8> [[ZT]], i64 16)
+// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv48i8(<vscale x 48 x i8> [[ZT]], i64 32)
+// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sve.st3q.nxv16i8(<vscale x 16 x i8> [[TMP0]], <vscale x 16 x i8> [[TMP1]], <vscale x 16 x i8> [[TMP2]], <vscale x 16 x i1> [[PG:%.*]], ptr [[BASE:%.*]])
+// CPP-CHECK-NEXT: ret void
+//
+void test_svst3q_s8(svbool_t pg, const int8_t *base, svint8x3_t zt)
+{
+ SVE_ACLE_FUNC(svst3q,,_s8,)(pg, base, zt);
+}
+// CHECK-LABEL: @test_svst3q_u16(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 8 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv8i1(<vscale x 16 x i1> [[PG:%.*]])
+// CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv24i16(<vscale x 24 x i16> [[ZT:%.*]], i64 0)
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv24i16(<vscale x 24 x i16> [[ZT]], i64 8)
+// CHECK-NEXT: [[TMP3:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv24i16(<vscale x 24 x i16> [[ZT]], i64 16)
+// CHECK-NEXT: tail call void @llvm.aarch64.sve.st3q.nxv8i16(<vscale x 8 x i16> [[TMP1]], <vscale x 8 x i16> [[TMP2]], <vscale x 8 x i16> [[TMP3]], <vscale x 8 x i1> [[TMP0]], ptr [[BASE:%.*]])
+// CHECK-NEXT: ret void
+//
+// CPP-CHECK-LABEL: @_Z15test_svst3q_u16u10__SVBool_tPKt12svuint16x3_t(
+// CPP-CHECK-NEXT: entry:
+// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 8 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv8i1(<vscale x 16 x i1> [[PG:%.*]])
+// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv24i16(<vscale x 24 x i16> [[ZT:%.*]], i64 0)
+// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv24i16(<vscale x 24 x i16> [[ZT]], i64 8)
+// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv24i16(<vscale x 24 x i16> [[ZT]], i64 16)
+// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sve.st3q.nxv8i16(<vscale x 8 x i16> [[TMP1]], <vscale x 8 x i16> [[TMP2]], <vscale x 8 x i16> [[TMP3]], <vscale x 8 x i1> [[TMP0]], ptr [[BASE:%.*]])
+// CPP-CHECK-NEXT: ret void
+//
+void test_svst3q_u16(svbool_t pg, const uint16_t *base, svuint16x3_t zt)
+{
+ SVE_ACLE_FUNC(svst3q,,_u16,)(pg, base, zt);
+}
+
+// CHECK-LABEL: @test_svst3q_s16(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 8 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv8i1(<vscale x 16 x i1> [[PG:%.*]])
+// CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv24i16(<vscale x 24 x i16> [[ZT:%.*]], i64 0)
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv24i16(<vscale x 24 x i16> [[ZT]], i64 8)
+// CHECK-NEXT: [[TMP3:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv24i16(<vscale x 24 x i16> [[ZT]], i64 16)
+// CHECK-NEXT: tail call void @llvm.aarch64.sve.st3q.nxv8i16(<vscale x 8 x i16> [[TMP1]], <vscale x 8 x i16> [[TMP2]], <vscale x 8 x i16> [[TMP3]], <vscale x 8 x i1> [[TMP0]], ptr [[BASE:%.*]])
+// CHECK-NEXT: ret void
+//
+// CPP-CHECK-LABEL: @_Z15test_svst3q_s16u10__SVBool_tPKs11svint16x3_t(
+// CPP-CHECK-NEXT: entry:
+// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 8 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv8i1(<vscale x 16 x i1> [[PG:%.*]])
+// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv24i16(<vscale x 24 x i16> [[ZT:%.*]], i64 0)
+// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv24i16(<vscale x 24 x i16> [[ZT]], i64 8)
+// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv24i16(<vscale x 24 x i16> [[ZT]], i64 16)
+// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sve.st3q.nxv8i16(<vscale x 8 x i16> [[TMP1]], <vscale x 8 x i16> [[TMP2]], <vscale x 8 x i16> [[TMP3]], <vscale x 8 x i1> [[TMP0]], ptr [[BASE:%.*]])
+// CPP-CHECK-NEXT: ret void
+//
+void test_svst3q_s16(svbool_t pg, const int16_t *base, svint16x3_t zt)
+{
+ SVE_ACLE_FUNC(svst3q,,_s16,)(pg, base, zt);
+}
+
+// CHECK-LABEL: @test_svst3q_u32(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 4 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv4i1(<vscale x 16 x i1> [[PG:%.*]])
+// CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 4 x i32> @llvm.vector.extract.nxv4i32.nxv12i32(<vscale x 12 x i32> [[ZT:%.*]], i64 0)
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <vscale x 4 x i32> @llvm.vector.extract.nxv4i32.nxv12i32(<vscale x 12 x i32> [[ZT]], i64 4)
+// CHECK-NEXT: [[TMP3:%.*]] = tail call <vscale x 4 x i32> @llvm.vector.extract.nxv4i32.nxv12i32(<vscale x 12 x i32> [[ZT]], i64 8)
+// CHECK-NEXT: tail call void @llvm.aarch64.sve.st3q.nxv4i32(<vscale x 4 x i32> [[TMP1]], <vscale x 4 x i32> [[TMP2]], <vscale x 4 x i32> [[TMP3]], <vscale x 4 x i1> [[TMP0]], ptr [[BASE:%.*]])
+// CHECK-NEXT: ret void
+//
+// CPP-CHECK-LABEL: @_Z15test_svst3q_u32u10__SVBool_tPKj12svuint32x3_t(
+// CPP-CHECK-NEXT: entry:
+// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 4 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv4i1(<vscale x 16 x i1> [[PG:%.*]])
+// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 4 x i32> @llvm.vector.extract.nxv4i32.nxv12i32(<vscale x 12 x i32> [[ZT:%.*]], i64 0)
+// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call <vscale x 4 x i32> @llvm.vector.extract.nxv4i32.nxv12i32(<vscale x 12 x i32> [[ZT]], i64 4)
+// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call <vscale x 4 x i32> @llvm.vector.extract.nxv4i32.nxv12i32(<vscale x 12 x i32> [[ZT]], i64 8)
+// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sve.st3q.nxv4i32(<vscale x 4 x i32> [[TMP1]], <vscale x 4 x i32> [[TMP2]], <vscale x 4 x i32> [[TMP3]], <vscale x 4 x i1> [[TMP0]], ptr [[BASE:%.*]])
+// CPP-CHECK-NEXT: ret void
+//
+void test_svst3q_u32(svbool_t pg, const uint32_t *base, svuint32x3_t zt)
+{
+ SVE_ACLE_FUNC(svst3q,,_u32,)(pg, base, zt);
+}
+
+// CHECK-LABEL: @test_svst3q_s32(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 4 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv4i1(<vscale x 16 x i1> [[PG:%.*]])
+// CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 4 x i32> @llvm.vector.extract.nxv4i32.nxv12i32(<vscale x 12 x i32> [[ZT:%.*]], i64 0)
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <vscale x 4 x i32> @llvm.vector.extract.nxv4i32.nxv12i32(<vscale x 12 x i32> [[ZT]], i64 4)
+// CHECK-NEXT: [[TMP3:%.*]] = tail call <vscale x 4 x i32> @llvm.vector.extract.nxv4i32.nxv12i32(<vscale x 12 x i32> [[ZT]], i64 8)
+// CHECK-NEXT: tail call void @llvm.aarch64.sve.st3q.nxv4i32(<vscale x 4 x i32> [[TMP1]], <vscale x 4 x i32> [[TMP2]], <vscale x 4 x i32> [[TMP3]], <vscale x 4 x i1> [[TMP0]], ptr [[BASE:%.*]])
+// CHECK-NEXT: ret void
+//
+// CPP-CHECK-LABEL: @_Z15test_svst3q_s32u10__SVBool_tPKi11svint32x3_t(
+// CPP-CHECK-NEXT: entry:
+// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 4 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv4i1(<vscale x 16 x i1> [[PG:%.*]])
+// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 4 x i32> @llvm.vector.extract.nxv4i32.nxv12i32(<vscale x 12 x i32> [[ZT:%.*]], i64 0)
+// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call <vscale x 4 x i32> @llvm.vector.extract.nxv4i32.nxv12i32(<vscale x 12 x i32> [[ZT]], i64 4)
+// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call <vscale x 4 x i32> @llvm.vector.extract.nxv4i32.nxv12i32(<vscale x 12 x i32> [[ZT]], i64 8)
+// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sve.st3q.nxv4i32(<vscale x 4 x i32> [[TMP1]], <vscale x 4 x i32> [[TMP2]], <vscale x 4 x i32> [[TMP3]], <vscale x 4 x i1> [[TMP0]], ptr [[BASE:%.*]])
+// CPP-CHECK-NEXT: ret void
+//
+void test_svst3q_s32(svbool_t pg, const int32_t *base, svint32x3_t zt)
+{
+ SVE_ACLE_FUNC(svst3q,,_s32,)(pg, base, zt);
+}
+
+// CHECK-LABEL: @test_svst3q_u64(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 2 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv2i1(<vscale x 16 x i1> [[PG:%.*]])
+// CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 2 x i64> @llvm.vector.extract.nxv2i64.nxv6i64(<vscale x 6 x i64> [[ZT:%.*]], i64 0)
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <vscale x 2 x i64> @llvm.vector.extract.nxv2i64.nxv6i64(<vscale x 6 x i64> [[ZT]], i64 2)
+// CHECK-NEXT: [[TMP3:%.*]] = tail call <vscale x 2 x i64> @llvm.vector.extract.nxv2i64.nxv6i64(<vscale x 6 x i64> [[ZT]], i64 4)
+// CHECK-NEXT: tail call void @llvm.aarch64.sve.st3q.nxv2i64(<vscale x 2 x i64> [[TMP1]], <vscale x 2 x i64> [[TMP2]], <vscale x 2 x i64> [[TMP3]], <vscale x 2 x i1> [[TMP0]], ptr [[BASE:%.*]])
+// CHECK-NEXT: ret void
+//
+// CPP-CHECK-LABEL: @_Z15test_svst3q_u64u10__SVBool_tPKm12svuint64x3_t(
+// CPP-CHECK-NEXT: entry:
+// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 2 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv2i1(<vscale x 16 x i1> [[PG:%.*]])
+// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 2 x i64> @llvm.vector.extract.nxv2i64.nxv6i64(<vscale x 6 x i64> [[ZT:%.*]], i64 0)
+// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call <vscale x 2 x i64> @llvm.vector.extract.nxv2i64.nxv6i64(<vscale x 6 x i64> [[ZT]], i64 2)
+// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call <vscale x 2 x i64> @llvm.vector.extract.nxv2i64.nxv6i64(<vscale x 6 x i64> [[ZT]], i64 4)
+// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sve.st3q.nxv2i64(<vscale x 2 x i64> [[TMP1]], <vscale x 2 x i64> [[TMP2]], <vscale x 2 x i64> [[TMP3]], <vscale x 2 x i1> [[TMP0]], ptr [[BASE:%.*]])
+// CPP-CHECK-NEXT: ret void
+//
+void test_svst3q_u64(svbool_t pg, const uint64_t *base, svuint64x3_t zt)
+{
+ SVE_ACLE_FUNC(svst3q,,_u64,)(pg, base, zt);
+}
+
+// CHECK-LABEL: @test_svst3q_s64(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 2 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv2i1(<vscale x 16 x i1> [[PG:%.*]])
+// CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 2 x i64> @llvm.vector.extract.nxv2i64.nxv6i64(<vscale x 6 x i64> [[ZT:%.*]], i64 0)
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <vscale x 2 x i64> @llvm.vector.extract.nxv2i64.nxv6i64(<vscale x 6 x i64> [[ZT]], i64 2)
+// CHECK-NEXT: [[TMP3:%.*]] = tail call <vscale x 2 x i64> @llvm.vector.extract.nxv2i64.nxv6i64(<vscale x 6 x i64> [[ZT]], i64 4)
+// CHECK-NEXT: tail call void @llvm.aarch64.sve.st3q.nxv2i64(<vscale x 2 x i64> [[TMP1]], <vscale x 2 x i64> [[TMP2]], <vscale x 2 x i64> [[TMP3]], <vscale x 2 x i1> [[TMP0]], ptr [[BASE:%.*]])
+// CHECK-NEXT: ret void
+//
+// CPP-CHECK-LABEL: @_Z15test_svst3q_s64u10__SVBool_tPKl11svint64x3_t(
+// CPP-CHECK-NEXT: entry:
+// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 2 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv2i1(<vscale x 16 x i1> [[PG:%.*]])
+// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 2 x i64> @llvm.vector.extract.nxv2i64.nxv6i64(<vscale x 6 x i64> [[ZT:%.*]], i64 0)
+// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call <vscale x 2 x i64> @llvm.vector.extract.nxv2i64.nxv6i64(<vscale x 6 x i64> [[ZT]], i64 2)
+// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call <vscale x 2 x i64> @llvm.vector.extract.nxv2i64.nxv6i64(<vscale x 6 x i64> [[ZT]], i64 4)
+// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sve.st3q.nxv2i64(<vscale x 2 x i64> [[TMP1]], <vscale x 2 x i64> [[TMP2]], <vscale x 2 x i64> [[TMP3]], <vscale x 2 x i1> [[TMP0]], ptr [[BASE:%.*]])
+// CPP-CHECK-NEXT: ret void
+//
+void test_svst3q_s64(svbool_t pg, const int64_t *base, svint64x3_t zt)
+{
+ SVE_ACLE_FUNC(svst3q,,_s64,)(pg, base, zt);
+}
+
+// CHECK-LABEL: @test_svst3q_f16(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 8 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv8i1(<vscale x 16 x i1> [[PG:%.*]])
+// CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 8 x half> @llvm.vector.extract.nxv8f16.nxv24f16(<vscale x 24 x half> [[ZT:%.*]], i64 0)
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <vscale x 8 x half> @llvm.vector.extract.nxv8f16.nxv24f16(<vscale x 24 x half> [[ZT]], i64 8)
+// CHECK-NEXT: [[TMP3:%.*]] = tail call <vscale x 8 x half> @llvm.vector.extract.nxv8f16.nxv24f16(<vscale x 24 x half> [[ZT]], i64 16)
+// CHECK-NEXT: tail call void @llvm.aarch64.sve.st3q.nxv8f16(<vscale x 8 x half> [[TMP1]], <vscale x 8 x half> [[TMP2]], <vscale x 8 x half> [[TMP3]], <vscale x 8 x i1> [[TMP0]], ptr [[BASE:%.*]])
+// CHECK-NEXT: ret void
+//
+// CPP-CHECK-LABEL: @_Z15test_svst3q_f16u10__SVBool_tPKDh13svfloat16x3_t(
+// CPP-CHECK-NEXT: entry:
+// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 8 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv8i1(<vscale x 16 x i1> [[PG:%.*]])
+// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 8 x half> @llvm.vector.extract.nxv8f16.nxv24f16(<vscale x 24 x half> [[ZT:%.*]], i64 0)
+// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call <vscale x 8 x half> @llvm.vector.extract.nxv8f16.nxv24f16(<vscale x 24 x half> [[ZT]], i64 8)
+// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call <vscale x 8 x half> @llvm.vector.extract.nxv8f16.nxv24f16(<vscale x 24 x half> [[ZT]], i64 16)
+// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sve.st3q.nxv8f16(<vscale x 8 x half> [[TMP1]], <vscale x 8 x half> [[TMP2]], <vscale x 8 x half> [[TMP3]], <vscale x 8 x i1> [[TMP0]], ptr [[BASE:%.*]])
+// CPP-CHECK-NEXT: ret void
+//
+void test_svst3q_f16(svbool_t pg, const float16_t *base, svfloat16x3_t zt)
+{
+ SVE_ACLE_FUNC(svst3q,,_f16,)(pg, base, zt);
+}
+
+// CHECK-LABEL: @test_svst3q_bf16(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 8 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv8i1(<vscale x 16 x i1> [[PG:%.*]])
+// CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 8 x bfloat> @llvm.vector.extract.nxv8bf16.nxv24bf16(<vscale x 24 x bfloat> [[ZT:%.*]], i64 0)
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <vscale x 8 x bfloat> @llvm.vector.extract.nxv8bf16.nxv24bf16(<vscale x 24 x bfloat> [[ZT]], i64 8)
+// CHECK-NEXT: [[TMP3:%.*]] = tail call <vscale x 8 x bfloat> @llvm.vector.extract.nxv8bf16.nxv24bf16(<vscale x 24 x bfloat> [[ZT]], i64 16)
+// CHECK-NEXT: tail call void @llvm.aarch64.sve.st3q.nxv8bf16(<vscale x 8 x bfloat> [[TMP1]], <vscale x 8 x bfloat> [[TMP2]], <vscale x 8 x bfloat> [[TMP3]], <vscale x 8 x i1> [[TMP0]], ptr [[BASE:%.*]])
+// CHECK-NEXT: ret void
+//
+// CPP-CHECK-LABEL: @_Z16test_svst3q_bf16u10__SVBool_tPKu6__bf1614svbfloat16x3_t(
+// CPP-CHECK-NEXT: entry:
+// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 8 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv8i1(<vscale x 16 x i1> [[PG:%.*]])
+// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 8 x bfloat> @llvm.vector.extract.nxv8bf16.nxv24bf16(<vscale x 24 x bfloat> [[ZT:%.*]], i64 0)
+// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call <vscale x 8 x bfloat> @llvm.vector.extract.nxv8bf16.nxv24bf16(<vscale x 24 x bfloat> [[ZT]], i64 8)
+// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call <vscale x 8 x bfloat> @llvm.vector.extract.nxv8bf16.nxv24bf16(<vscale x 24 x bfloat> [[ZT]], i64 16)
+// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sve.st3q.nxv8bf16(<vscale x 8 x bfloat> [[TMP1]], <vscale x 8 x bfloat> [[TMP2]], <vscale x 8 x bfloat> [[TMP3]], <vscale x 8 x i1> [[TMP0]], ptr [[BASE:%.*]])
+// CPP-CHECK-NEXT: ret void
+//
+void test_svst3q_bf16(svbool_t pg, const bfloat16_t *base, svbfloat16x3_t zt)
+{
+ SVE_ACLE_FUNC(svst3q,,_bf16,)(pg, base, zt);
+}
+
+// CHECK-LABEL: @test_svst3q_f32(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 4 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv4i1(<vscale x 16 x i1> [[PG:%.*]])
+// CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 4 x float> @llvm.vector.extract.nxv4f32.nxv12f32(<vscale x 12 x float> [[ZT:%.*]], i64 0)
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <vscale x 4 x float> @llvm.vector.extract.nxv4f32.nxv12f32(<vscale x 12 x float> [[ZT]], i64 4)
+// CHECK-NEXT: [[TMP3:%.*]] = tail call <vscale x 4 x float> @llvm.vector.extract.nxv4f32.nxv12f32(<vscale x 12 x float> [[ZT]], i64 8)
+// CHECK-NEXT: tail call void @llvm.aarch64.sve.st3q.nxv4f32(<vscale x 4 x float> [[TMP1]], <vscale x 4 x float> [[TMP2]], <vscale x 4 x float> [[TMP3]], <vscale x 4 x i1> [[TMP0]], ptr [[BASE:%.*]])
+// CHECK-NEXT: ret void
+//
+// CPP-CHECK-LABEL: @_Z15test_svst3q_f32u10__SVBool_tPKf13svfloat32x3_t(
+// CPP-CHECK-NEXT: entry:
+// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 4 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv4i1(<vscale x 16 x i1> [[PG:%.*]])
+// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 4 x float> @llvm.vector.extract.nxv4f32.nxv12f32(<vscale x 12 x float> [[ZT:%.*]], i64 0)
+// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call <vscale x 4 x float> @llvm.vector.extract.nxv4f32.nxv12f32(<vscale x 12 x float> [[ZT]], i64 4)
+// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call <vscale x 4 x float> @llvm.vector.extract.nxv4f32.nxv12f32(<vscale x 12 x float> [[ZT]], i64 8)
+// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sve.st3q.nxv4f32(<vscale x 4 x float> [[TMP1]], <vscale x 4 x float> [[TMP2]], <vscale x 4 x float> [[TMP3]], <vscale x 4 x i1> [[TMP0]], ptr [[BASE:%.*]])
+// CPP-CHECK-NEXT: ret void
+//
+void test_svst3q_f32(svbool_t pg, const float32_t *base, svfloat32x3_t zt)
+{
+ SVE_ACLE_FUNC(svst3q,,_f32,)(pg, base, zt);
+}
+
+// CHECK-LABEL: @test_svst3q_f64(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 2 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv2i1(<vscale x 16 x i1> [[PG:%.*]])
+// CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 2 x double> @llvm.vector.extract.nxv2f64.nxv6f64(<vscale x 6 x double> [[ZT:%.*]], i64 0)
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <vscale x 2 x double> @llvm.vector.extract.nxv2f64.nxv6f64(<vscale x 6 x double> [[ZT]], i64 2)
+// CHECK-NEXT: [[TMP3:%.*]] = tail call <vscale x 2 x double> @llvm.vector.extract.nxv2f64.nxv6f64(<vscale x 6 x double> [[ZT]], i64 4)
+// CHECK-NEXT: tail call void @llvm.aarch64.sve.st3q.nxv2f64(<vscale x 2 x double> [[TMP1]], <vscale x 2 x double> [[TMP2]], <vscale x 2 x double> [[TMP3]], <vscale x 2 x i1> [[TMP0]], ptr [[BASE:%.*]])
+// CHECK-NEXT: ret void
+//
+// CPP-CHECK-LABEL: @_Z15test_svst3q_f64u10__SVBool_tPKd13svfloat64x3_t(
+// CPP-CHECK-NEXT: entry:
+// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 2 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv2i1(<vscale x 16 x i1> [[PG:%.*]])
+// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 2 x double> @llvm.vector.extract.nxv2f64.nxv6f64(<vscale x 6 x double> [[ZT:%.*]], i64 0)
+// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call <vscale x 2 x double> @llvm.vector.extract.nxv2f64.nxv6f64(<vscale x 6 x double> [[ZT]], i64 2)
+// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call <vscale x 2 x double> @llvm.vector.extract.nxv2f64.nxv6f64(<vscale x 6 x double> [[ZT]], i64 4)
+// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sve.st3q.nxv2f64(<vscale x 2 x double> [[TMP1]], <vscale x 2 x double> [[TMP2]], <vscale x 2 x double> [[TMP3]], <vscale x 2 x i1> [[TMP0]], ptr [[BASE:%.*]])
+// CPP-CHECK-NEXT: ret void
+//
+void test_svst3q_f64(svbool_t pg, const float64_t *base, svfloat64x3_t zt)
+{
+ SVE_ACLE_FUNC(svst3q,,_f64,)(pg, base, zt);
+}
+
+// CHECK-LABEL: @test_svst3q_vnum_u8(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[TMP0:%.*]] = getelementptr <vscale x 16 x i8>, ptr [[BASE:%.*]], i64 [[VNUM:%.*]]
+// CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv48i8(<vscale x 48 x i8> [[ZT:%.*]], i64 0)
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv48i8(<vscale x 48 x i8> [[ZT]], i64 16)
+// CHECK-NEXT: [[TMP3:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv48i8(<vscale x 48 x i8> [[ZT]], i64 32)
+// CHECK-NEXT: tail call void @llvm.aarch64.sve.st3q.nxv16i8(<vscale x 16 x i8> [[TMP1]], <vscale x 16 x i8> [[TMP2]], <vscale x 16 x i8> [[TMP3]], <vscale x 16 x i1> [[PG:%.*]], ptr [[TMP0]])
+// CHECK-NEXT: ret void
+//
+// CPP-CHECK-LABEL: @_Z19test_svst3q_vnum_u8u10__SVBool_tPKhl11svuint8x3_t(
+// CPP-CHECK-NEXT: entry:
+// CPP-CHECK-NEXT: [[TMP0:%.*]] = getelementptr <vscale x 16 x i8>, ptr [[BASE:%.*]], i64 [[VNUM:%.*]]
+// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv48i8(<vscale x 48 x i8> [[ZT:%.*]], i64 0)
+// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv48i8(<vscale x 48 x i8> [[ZT]], i64 16)
+// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv48i8(<vscale x 48 x i8> [[ZT]], i64 32)
+// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sve.st3q.nxv16i8(<vscale x 16 x i8> [[TMP1]], <vscale x 16 x i8> [[TMP2]], <vscale x 16 x i8> [[TMP3]], <vscale x 16 x i1> [[PG:%.*]], ptr [[TMP0]])
+// CPP-CHECK-NEXT: ret void
+//
+void test_svst3q_vnum_u8(svbool_t pg, const uint8_t *base, int64_t vnum, svuint8x3_t zt)
+{
+ SVE_ACLE_FUNC(svst3q_vnum_,,u8,)(pg, base, vnum, zt);
+}
+
+// CHECK-LABEL: @test_svst3q_vnum_s8(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[TMP0:%.*]] = getelementptr <vscale x 16 x i8>, ptr [[BASE:%.*]], i64 [[VNUM:%.*]]
+// CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv48i8(<vscale x 48 x i8> [[ZT:%.*]], i64 0)
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv48i8(<vscale x 48 x i8> [[ZT]], i64 16)
+// CHECK-NEXT: [[TMP3:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv48i8(<vscale x 48 x i8> [[ZT]], i64 32)
+// CHECK-NEXT: tail call void @llvm.aarch64.sve.st3q.nxv16i8(<vscale x 16 x i8> [[TMP1]], <vscale x 16 x i8> [[TMP2]], <vscale x 16 x i8> [[TMP3]], <vscale x 16 x i1> [[PG:%.*]], ptr [[TMP0]])
+// CHECK-NEXT: ret void
+//
+// CPP-CHECK-LABEL: @_Z19test_svst3q_vnum_s8u10__SVBool_tPKal10svint8x3_t(
+// CPP-CHECK-NEXT: entry:
+// CPP-CHECK-NEXT: [[TMP0:%.*]] = getelementptr <vscale x 16 x i8>, ptr [[BASE:%.*]], i64 [[VNUM:%.*]]
+// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv48i8(<vscale x 48 x i8> [[ZT:%.*]], i64 0)
+// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv48i8(<vscale x 48 x i8> [[ZT]], i64 16)
+// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv48i8(<vscale x 48 x i8> [[ZT]], i64 32)
+// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sve.st3q.nxv16i8(<vscale x 16 x i8> [[TMP1]], <vscale x 16 x i8> [[TMP2]], <vscale x 16 x i8> [[TMP3]], <vscale x 16 x i1> [[PG:%.*]], ptr [[TMP0]])
+// CPP-CHECK-NEXT: ret void
+//
+void test_svst3q_vnum_s8(svbool_t pg, const int8_t *base, int64_t vnum, svint8x3_t zt)
+{
+ SVE_ACLE_FUNC(svst3q_vnum,,_s8,)(pg, base, vnum, zt);
+}
+
+// CHECK-LABEL: @test_svst3q_vnum_u16(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 8 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv8i1(<vscale x 16 x i1> [[PG:%.*]])
+// CHECK-NEXT: [[TMP1:%.*]] = getelementptr <vscale x 8 x i16>, ptr [[BASE:%.*]], i64 [[VNUM:%.*]]
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv24i16(<vscale x 24 x i16> [[ZT:%.*]], i64 0)
+// CHECK-NEXT: [[TMP3:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv24i16(<vscale x 24 x i16> [[ZT]], i64 8)
+// CHECK-NEXT: [[TMP4:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv24i16(<vscale x 24 x i16> [[ZT]], i64 16)
+// CHECK-NEXT: tail call void @llvm.aarch64.sve.st3q.nxv8i16(<vscale x 8 x i16> [[TMP2]], <vscale x 8 x i16> [[TMP3]], <vscale x 8 x i16> [[TMP4]], <vscale x 8 x i1> [[TMP0]], ptr [[TMP1]])
+// CHECK-NEXT: ret void
+//
+// CPP-CHECK-LABEL: @_Z20test_svst3q_vnum_u16u10__SVBool_tPKtl12svuint16x3_t(
+// CPP-CHECK-NEXT: entry:
+// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 8 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv8i1(<vscale x 16 x i1> [[PG:%.*]])
+// CPP-CHECK-NEXT: [[TMP1:%.*]] = getelementptr <vscale x 8 x i16>, ptr [[BASE:%.*]], i64 [[VNUM:%.*]]
+// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv24i16(<vscale x 24 x i16> [[ZT:%.*]], i64 0)
+// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv24i16(<vscale x 24 x i16> [[ZT]], i64 8)
+// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv24i16(<vscale x 24 x i16> [[ZT]], i64 16)
+// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sve.st3q.nxv8i16(<vscale x 8 x i16> [[TMP2]], <vscale x 8 x i16> [[TMP3]], <vscale x 8 x i16> [[TMP4]], <vscale x 8 x i1> [[TMP0]], ptr [[TMP1]])
+// CPP-CHECK-NEXT: ret void
+//
+void test_svst3q_vnum_u16(svbool_t pg, const uint16_t *base, int64_t vnum, svuint16x3_t zt)
+{
+ SVE_ACLE_FUNC(svst3q_vnum,,_u16,)(pg, base, vnum, zt);
+}
+
+// CHECK-LABEL: @test_svst3q_vnum_s16(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 8 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv8i1(<vscale x 16 x i1> [[PG:%.*]])
+// CHECK-NEXT: [[TMP1:%.*]] = getelementptr <vscale x 8 x i16>, ptr [[BASE:%.*]], i64 [[VNUM:%.*]]
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv24i16(<vscale x 24 x i16> [[ZT:%.*]], i64 0)
+// CHECK-NEXT: [[TMP3:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv24i16(<vscale x 24 x i16> [[ZT]], i64 8)
+// CHECK-NEXT: [[TMP4:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv24i16(<vscale x 24 x i16> [[ZT]], i64 16)
+// CHECK-NEXT: tail call void @llvm.aarch64.sve.st3q.nxv8i16(<vscale x 8 x i16> [[TMP2]], <vscale x 8 x i16> [[TMP3]], <vscale x 8 x i16> [[TMP4]], <vscale x 8 x i1> [[TMP0]], ptr [[TMP1]])
+// CHECK-NEXT: ret void
+//
+// CPP-CHECK-LABEL: @_Z20test_svst3q_vnum_s16u10__SVBool_tPKsl11svint16x3_t(
+// CPP-CHECK-NEXT: entry:
+// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 8 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv8i1(<vscale x 16 x i1> [[PG:%.*]])
+// CPP-CHECK-NEXT: [[TMP1:%.*]] = getelementptr <vscale x 8 x i16>, ptr [[BASE:%.*]], i64 [[VNUM:%.*]]
+// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv24i16(<vscale x 24 x i16> [[ZT:%.*]], i64 0)
+// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv24i16(<vscale x 24 x i16> [[ZT]], i64 8)
+// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv24i16(<vscale x 24 x i16> [[ZT]], i64 16)
+// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sve.st3q.nxv8i16(<vscale x 8 x i16> [[TMP2]], <vscale x 8 x i16> [[TMP3]], <vscale x 8 x i16> [[TMP4]], <vscale x 8 x i1> [[TMP0]], ptr [[TMP1]])
+// CPP-CHECK-NEXT: ret void
+//
+void test_svst3q_vnum_s16(svbool_t pg, const int16_t *base, int64_t vnum, svint16x3_t zt)
+{
+ SVE_ACLE_FUNC(svst3q_vnum,,_s16,)(pg, base, vnum, zt);
+}
+
+// CHECK-LABEL: @test_svst3q_vnum_u32(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 4 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv4i1(<vscale x 16 x i1> [[PG:%.*]])
+// CHECK-NEXT: [[TMP1:%.*]] = getelementptr <vscale x 4 x i32>, ptr [[BASE:%.*]], i64 [[VNUM:%.*]]
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <vscale x 4 x i32> @llvm.vector.extract.nxv4i32.nxv12i32(<vscale x 12 x i32> [[ZT:%.*]], i64 0)
+// CHECK-NEXT: [[TMP3:%.*]] = tail call <vscale x 4 x i32> @llvm.vector.extract.nxv4i32.nxv12i32(<vscale x 12 x i32> [[ZT]], i64 4)
+// CHECK-NEXT: [[TMP4:%.*]] = tail call <vscale x 4 x i32> @llvm.vector.extract.nxv4i32.nxv12i32(<vscale x 12 x i32> [[ZT]], i64 8)
+// CHECK-NEXT: tail call void @llvm.aarch64.sve.st3q.nxv4i32(<vscale x 4 x i32> [[TMP2]], <vscale x 4 x i32> [[TMP3]], <vscale x 4 x i32> [[TMP4]], <vscale x 4 x i1> [[TMP0]], ptr [[TMP1]])
+// CHECK-NEXT: ret void
+//
+// CPP-CHECK-LABEL: @_Z20test_svst3q_vnum_u32u10__SVBool_tPKjl12svuint32x3_t(
+// CPP-CHECK-NEXT: entry:
+// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 4 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv4i1(<vscale x 16 x i1> [[PG:%.*]])
+// CPP-CHECK-NEXT: [[TMP1:%.*]] = getelementptr <vscale x 4 x i32>, ptr [[BASE:%.*]], i64 [[VNUM:%.*]]
+// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call <vscale x 4 x i32> @llvm.vector.extract.nxv4i32.nxv12i32(<vscale x 12 x i32> [[ZT:%.*]], i64 0)
+// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call <vscale x 4 x i32> @llvm.vector.extract.nxv4i32.nxv12i32(<vscale x 12 x i32> [[ZT]], i64 4)
+// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call <vscale x 4 x i32> @llvm.vector.extract.nxv4i32.nxv12i32(<vscale x 12 x i32> [[ZT]], i64 8)
+// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sve.st3q.nxv4i32(<vscale x 4 x i32> [[TMP2]], <vscale x 4 x i32> [[TMP3]], <vscale x 4 x i32> [[TMP4]], <vscale x 4 x i1> [[TMP0]], ptr [[TMP1]])
+// CPP-CHECK-NEXT: ret void
+//
+void test_svst3q_vnum_u32(svbool_t pg, const uint32_t *base, int64_t vnum, svuint32x3_t zt)
+{
+ SVE_ACLE_FUNC(svst3q_vnum,,_u32,)(pg, base, vnum, zt);
+}
+
+// CHECK-LABEL: @test_svst3q_vnum_s32(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 4 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv4i1(<vscale x 16 x i1> [[PG:%.*]])
+// CHECK-NEXT: [[TMP1:%.*]] = getelementptr <vscale x 4 x i32>, ptr [[BASE:%.*]], i64 [[VNUM:%.*]]
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <vscale x 4 x i32> @llvm.vector.extract.nxv4i32.nxv12i32(<vscale x 12 x i32> [[ZT:%.*]], i64 0)
+// CHECK-NEXT: [[TMP3:%.*]] = tail call <vscale x 4 x i32> @llvm.vector.extract.nxv4i32.nxv12i32(<vscale x 12 x i32> [[ZT]], i64 4)
+// CHECK-NEXT: [[TMP4:%.*]] = tail call <vscale x 4 x i32> @llvm.vector.extract.nxv4i32.nxv12i32(<vscale x 12 x i32> [[ZT]], i64 8)
+// CHECK-NEXT: tail call void @llvm.aarch64.sve.st3q.nxv4i32(<vscale x 4 x i32> [[TMP2]], <vscale x 4 x i32> [[TMP3]], <vscale x 4 x i32> [[TMP4]], <vscale x 4 x i1> [[TMP0]], ptr [[TMP1]])
+// CHECK-NEXT: ret void
+//
+// CPP-CHECK-LABEL: @_Z20test_svst3q_vnum_s32u10__SVBool_tPKil11svint32x3_t(
+// CPP-CHECK-NEXT: entry:
+// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 4 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv4i1(<vscale x 16 x i1> [[PG:%.*]])
+// CPP-CHECK-NEXT: [[TMP1:%.*]] = getelementptr <vscale x 4 x i32>, ptr [[BASE:%.*]], i64 [[VNUM:%.*]]
+// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call <vscale x 4 x i32> @llvm.vector.extract.nxv4i32.nxv12i32(<vscale x 12 x i32> [[ZT:%.*]], i64 0)
+// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call <vscale x 4 x i32> @llvm.vector.extract.nxv4i32.nxv12i32(<vscale x 12 x i32> [[ZT]], i64 4)
+// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call <vscale x 4 x i32> @llvm.vector.extract.nxv4i32.nxv12i32(<vscale x 12 x i32> [[ZT]], i64 8)
+// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sve.st3q.nxv4i32(<vscale x 4 x i32> [[TMP2]], <vscale x 4 x i32> [[TMP3]], <vscale x 4 x i32> [[TMP4]], <vscale x 4 x i1> [[TMP0]], ptr [[TMP1]])
+// CPP-CHECK-NEXT: ret void
+//
+void test_svst3q_vnum_s32(svbool_t pg, const int32_t *base, int64_t vnum, svint32x3_t zt)
+{
+ SVE_ACLE_FUNC(svst3q_vnum,,_s32,)(pg, base, vnum, zt);
+}
+
+// CHECK-LABEL: @test_svst3q_vnum_u64(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 2 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv2i1(<vscale x 16 x i1> [[PG:%.*]])
+// CHECK-NEXT: [[TMP1:%.*]] = getelementptr <vscale x 2 x i64>, ptr [[BASE:%.*]], i64 [[VNUM:%.*]]
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <vscale x 2 x i64> @llvm.vector.extract.nxv2i64.nxv6i64(<vscale x 6 x i64> [[ZT:%.*]], i64 0)
+// CHECK-NEXT: [[TMP3:%.*]] = tail call <vscale x 2 x i64> @llvm.vector.extract.nxv2i64.nxv6i64(<vscale x 6 x i64> [[ZT]], i64 2)
+// CHECK-NEXT: [[TMP4:%.*]] = tail call <vscale x 2 x i64> @llvm.vector.extract.nxv2i64.nxv6i64(<vscale x 6 x i64> [[ZT]], i64 4)
+// CHECK-NEXT: tail call void @llvm.aarch64.sve.st3q.nxv2i64(<vscale x 2 x i64> [[TMP2]], <vscale x 2 x i64> [[TMP3]], <vscale x 2 x i64> [[TMP4]], <vscale x 2 x i1> [[TMP0]], ptr [[TMP1]])
+// CHECK-NEXT: ret void
+//
+// CPP-CHECK-LABEL: @_Z20test_svst3q_vnum_u64u10__SVBool_tPKml12svuint64x3_t(
+// CPP-CHECK-NEXT: entry:
+// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 2 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv2i1(<vscale x 16 x i1> [[PG:%.*]])
+// CPP-CHECK-NEXT: [[TMP1:%.*]] = getelementptr <vscale x 2 x i64>, ptr [[BASE:%.*]], i64 [[VNUM:%.*]]
+// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call <vscale x 2 x i64> @llvm.vector.extract.nxv2i64.nxv6i64(<vscale x 6 x i64> [[ZT:%.*]], i64 0)
+// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call <vscale x 2 x i64> @llvm.vector.extract.nxv2i64.nxv6i64(<vscale x 6 x i64> [[ZT]], i64 2)
+// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call <vscale x 2 x i64> @llvm.vector.extract.nxv2i64.nxv6i64(<vscale x 6 x i64> [[ZT]], i64 4)
+// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sve.st3q.nxv2i64(<vscale x 2 x i64> [[TMP2]], <vscale x 2 x i64> [[TMP3]], <vscale x 2 x i64> [[TMP4]], <vscale x 2 x i1> [[TMP0]], ptr [[TMP1]])
+// CPP-CHECK-NEXT: ret void
+//
+void test_svst3q_vnum_u64(svbool_t pg, const uint64_t *base, int64_t vnum, svuint64x3_t zt)
+{
+ SVE_ACLE_FUNC(svst3q_vnum,,_u64,)(pg, base, vnum, zt);
+}
+
+// CHECK-LABEL: @test_svst3q_vnum_s64(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 2 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv2i1(<vscale x 16 x i1> [[PG:%.*]])
+// CHECK-NEXT: [[TMP1:%.*]] = getelementptr <vscale x 2 x i64>, ptr [[BASE:%.*]], i64 [[VNUM:%.*]]
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <vscale x 2 x i64> @llvm.vector.extract.nxv2i64.nxv6i64(<vscale x 6 x i64> [[ZT:%.*]], i64 0)
+// CHECK-NEXT: [[TMP3:%.*]] = tail call <vscale x 2 x i64> @llvm.vector.extract.nxv2i64.nxv6i64(<vscale x 6 x i64> [[ZT]], i64 2)
+// CHECK-NEXT: [[TMP4:%.*]] = tail call <vscale x 2 x i64> @llvm.vector.extract.nxv2i64.nxv6i64(<vscale x 6 x i64> [[ZT]], i64 4)
+// CHECK-NEXT: tail call void @llvm.aarch64.sve.st3q.nxv2i64(<vscale x 2 x i64> [[TMP2]], <vscale x 2 x i64> [[TMP3]], <vscale x 2 x i64> [[TMP4]], <vscale x 2 x i1> [[TMP0]], ptr [[TMP1]])
+// CHECK-NEXT: ret void
+//
+// CPP-CHECK-LABEL: @_Z20test_svst3q_vnum_s64u10__SVBool_tPKll11svint64x3_t(
+// CPP-CHECK-NEXT: entry:
+// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 2 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv2i1(<vscale x 16 x i1> [[PG:%.*]])
+// CPP-CHECK-NEXT: [[TMP1:%.*]] = getelementptr <vscale x 2 x i64>, ptr [[BASE:%.*]], i64 [[VNUM:%.*]]
+// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call <vscale x 2 x i64> @llvm.vector.extract.nxv2i64.nxv6i64(<vscale x 6 x i64> [[ZT:%.*]], i64 0)
+// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call <vscale x 2 x i64> @llvm.vector.extract.nxv2i64.nxv6i64(<vscale x 6 x i64> [[ZT]], i64 2)
+// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call <vscale x 2 x i64> @llvm.vector.extract.nxv2i64.nxv6i64(<vscale x 6 x i64> [[ZT]], i64 4)
+// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sve.st3q.nxv2i64(<vscale x 2 x i64> [[TMP2]], <vscale x 2 x i64> [[TMP3]], <vscale x 2 x i64> [[TMP4]], <vscale x 2 x i1> [[TMP0]], ptr [[TMP1]])
+// CPP-CHECK-NEXT: ret void
+//
+void test_svst3q_vnum_s64(svbool_t pg, const int64_t *base, int64_t vnum, svint64x3_t zt)
+{
+ SVE_ACLE_FUNC(svst3q_vnum,,_s64,)(pg, base, vnum, zt);
+}
+
+// CHECK-LABEL: @test_svst3q_vnum_f16(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 8 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv8i1(<vscale x 16 x i1> [[PG:%.*]])
+// CHECK-NEXT: [[TMP1:%.*]] = getelementptr <vscale x 8 x half>, ptr [[BASE:%.*]], i64 [[VNUM:%.*]]
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <vscale x 8 x half> @llvm.vector.extract.nxv8f16.nxv24f16(<vscale x 24 x half> [[ZT:%.*]], i64 0)
+// CHECK-NEXT: [[TMP3:%.*]] = tail call <vscale x 8 x half> @llvm.vector.extract.nxv8f16.nxv24f16(<vscale x 24 x half> [[ZT]], i64 8)
+// CHECK-NEXT: [[TMP4:%.*]] = tail call <vscale x 8 x half> @llvm.vector.extract.nxv8f16.nxv24f16(<vscale x 24 x half> [[ZT]], i64 16)
+// CHECK-NEXT: tail call void @llvm.aarch64.sve.st3q.nxv8f16(<vscale x 8 x half> [[TMP2]], <vscale x 8 x half> [[TMP3]], <vscale x 8 x half> [[TMP4]], <vscale x 8 x i1> [[TMP0]], ptr [[TMP1]])
+// CHECK-NEXT: ret void
+//
+// CPP-CHECK-LABEL: @_Z20test_svst3q_vnum_f16u10__SVBool_tPKDhl13svfloat16x3_t(
+// CPP-CHECK-NEXT: entry:
+// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 8 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv8i1(<vscale x 16 x i1> [[PG:%.*]])
+// CPP-CHECK-NEXT: [[TMP1:%.*]] = getelementptr <vscale x 8 x half>, ptr [[BASE:%.*]], i64 [[VNUM:%.*]]
+// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call <vscale x 8 x half> @llvm.vector.extract.nxv8f16.nxv24f16(<vscale x 24 x half> [[ZT:%.*]], i64 0)
+// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call <vscale x 8 x half> @llvm.vector.extract.nxv8f16.nxv24f16(<vscale x 24 x half> [[ZT]], i64 8)
+// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call <vscale x 8 x half> @llvm.vector.extract.nxv8f16.nxv24f16(<vscale x 24 x half> [[ZT]], i64 16)
+// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sve.st3q.nxv8f16(<vscale x 8 x half> [[TMP2]], <vscale x 8 x half> [[TMP3]], <vscale x 8 x half> [[TMP4]], <vscale x 8 x i1> [[TMP0]], ptr [[TMP1]])
+// CPP-CHECK-NEXT: ret void
+//
+void test_svst3q_vnum_f16(svbool_t pg, const float16_t *base, int64_t vnum, svfloat16x3_t zt)
+{
+ SVE_ACLE_FUNC(svst3q_vnum,,_f16,)(pg, base, vnum, zt);
+}
+
+// CHECK-LABEL: @test_svst3q_vnum_bf16(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 8 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv8i1(<vscale x 16 x i1> [[PG:%.*]])
+// CHECK-NEXT: [[TMP1:%.*]] = getelementptr <vscale x 8 x bfloat>, ptr [[BASE:%.*]], i64 [[VNUM:%.*]]
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <vscale x 8 x bfloat> @llvm.vector.extract.nxv8bf16.nxv24bf16(<vscale x 24 x bfloat> [[ZT:%.*]], i64 0)
+// CHECK-NEXT: [[TMP3:%.*]] = tail call <vscale x 8 x bfloat> @llvm.vector.extract.nxv8bf16.nxv24bf16(<vscale x 24 x bfloat> [[ZT]], i64 8)
+// CHECK-NEXT: [[TMP4:%.*]] = tail call <vscale x 8 x bfloat> @llvm.vector.extract.nxv8bf16.nxv24bf16(<vscale x 24 x bfloat> [[ZT]], i64 16)
+// CHECK-NEXT: tail call void @llvm.aarch64.sve.st3q.nxv8bf16(<vscale x 8 x bfloat> [[TMP2]], <vscale x 8 x bfloat> [[TMP3]], <vscale x 8 x bfloat> [[TMP4]], <vscale x 8 x i1> [[TMP0]], ptr [[TMP1]])
+// CHECK-NEXT: ret void
+//
+// CPP-CHECK-LABEL: @_Z21test_svst3q_vnum_bf16u10__SVBool_tPKu6__bf16l14svbfloat16x3_t(
+// CPP-CHECK-NEXT: entry:
+// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 8 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv8i1(<vscale x 16 x i1> [[PG:%.*]])
+// CPP-CHECK-NEXT: [[TMP1:%.*]] = getelementptr <vscale x 8 x bfloat>, ptr [[BASE:%.*]], i64 [[VNUM:%.*]]
+// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call <vscale x 8 x bfloat> @llvm.vector.extract.nxv8bf16.nxv24bf16(<vscale x 24 x bfloat> [[ZT:%.*]], i64 0)
+// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call <vscale x 8 x bfloat> @llvm.vector.extract.nxv8bf16.nxv24bf16(<vscale x 24 x bfloat> [[ZT]], i64 8)
+// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call <vscale x 8 x bfloat> @llvm.vector.extract.nxv8bf16.nxv24bf16(<vscale x 24 x bfloat> [[ZT]], i64 16)
+// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sve.st3q.nxv8bf16(<vscale x 8 x bfloat> [[TMP2]], <vscale x 8 x bfloat> [[TMP3]], <vscale x 8 x bfloat> [[TMP4]], <vscale x 8 x i1> [[TMP0]], ptr [[TMP1]])
+// CPP-CHECK-NEXT: ret void
+//
+void test_svst3q_vnum_bf16(svbool_t pg, const bfloat16_t *base, int64_t vnum, svbfloat16x3_t zt)
+{
+ SVE_ACLE_FUNC(svst3q_vnum,,_bf16,)(pg, base, vnum, zt);
+}
+
+// CHECK-LABEL: @test_svst3q_vnum_f32(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 4 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv4i1(<vscale x 16 x i1> [[PG:%.*]])
+// CHECK-NEXT: [[TMP1:%.*]] = getelementptr <vscale x 4 x float>, ptr [[BASE:%.*]], i64 [[VNUM:%.*]]
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <vscale x 4 x float> @llvm.vector.extract.nxv4f32.nxv12f32(<vscale x 12 x float> [[ZT:%.*]], i64 0)
+// CHECK-NEXT: [[TMP3:%.*]] = tail call <vscale x 4 x float> @llvm.vector.extract.nxv4f32.nxv12f32(<vscale x 12 x float> [[ZT]], i64 4)
+// CHECK-NEXT: [[TMP4:%.*]] = tail call <vscale x 4 x float> @llvm.vector.extract.nxv4f32.nxv12f32(<vscale x 12 x float> [[ZT]], i64 8)
+// CHECK-NEXT: tail call void @llvm.aarch64.sve.st3q.nxv4f32(<vscale x 4 x float> [[TMP2]], <vscale x 4 x float> [[TMP3]], <vscale x 4 x float> [[TMP4]], <vscale x 4 x i1> [[TMP0]], ptr [[TMP1]])
+// CHECK-NEXT: ret void
+//
+// CPP-CHECK-LABEL: @_Z20test_svst3q_vnum_f32u10__SVBool_tPKfl13svfloat32x3_t(
+// CPP-CHECK-NEXT: entry:
+// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 4 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv4i1(<vscale x 16 x i1> [[PG:%.*]])
+// CPP-CHECK-NEXT: [[TMP1:%.*]] = getelementptr <vscale x 4 x float>, ptr [[BASE:%.*]], i64 [[VNUM:%.*]]
+// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call <vscale x 4 x float> @llvm.vector.extract.nxv4f32.nxv12f32(<vscale x 12 x float> [[ZT:%.*]], i64 0)
+// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call <vscale x 4 x float> @llvm.vector.extract.nxv4f32.nxv12f32(<vscale x 12 x float> [[ZT]], i64 4)
+// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call <vscale x 4 x float> @llvm.vector.extract.nxv4f32.nxv12f32(<vscale x 12 x float> [[ZT]], i64 8)
+// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sve.st3q.nxv4f32(<vscale x 4 x float> [[TMP2]], <vscale x 4 x float> [[TMP3]], <vscale x 4 x float> [[TMP4]], <vscale x 4 x i1> [[TMP0]], ptr [[TMP1]])
+// CPP-CHECK-NEXT: ret void
+//
+void test_svst3q_vnum_f32(svbool_t pg, const float32_t *base, int64_t vnum, svfloat32x3_t zt)
+{
+ SVE_ACLE_FUNC(svst3q_vnum,,_f32,)(pg, base, vnum, zt);
+}
+
+// CHECK-LABEL: @test_svst3q_vnum_f64(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 2 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv2i1(<vscale x 16 x i1> [[PG:%.*]])
+// CHECK-NEXT: [[TMP1:%.*]] = getelementptr <vscale x 2 x double>, ptr [[BASE:%.*]], i64 [[VNUM:%.*]]
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <vscale x 2 x double> @llvm.vector.extract.nxv2f64.nxv6f64(<vscale x 6 x double> [[ZT:%.*]], i64 0)
+// CHECK-NEXT: [[TMP3:%.*]] = tail call <vscale x 2 x double> @llvm.vector.extract.nxv2f64.nxv6f64(<vscale x 6 x double> [[ZT]], i64 2)
+// CHECK-NEXT: [[TMP4:%.*]] = tail call <vscale x 2 x double> @llvm.vector.extract.nxv2f64.nxv6f64(<vscale x 6 x double> [[ZT]], i64 4)
+// CHECK-NEXT: tail call void @llvm.aarch64.sve.st3q.nxv2f64(<vscale x 2 x double> [[TMP2]], <vscale x 2 x double> [[TMP3]], <vscale x 2 x double> [[TMP4]], <vscale x 2 x i1> [[TMP0]], ptr [[TMP1]])
+// CHECK-NEXT: ret void
+//
+// CPP-CHECK-LABEL: @_Z20test_svst3q_vnum_f64u10__SVBool_tPKdl13svfloat64x3_t(
+// CPP-CHECK-NEXT: entry:
+// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 2 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv2i1(<vscale x 16 x i1> [[PG:%.*]])
+// CPP-CHECK-NEXT: [[TMP1:%.*]] = getelementptr <vscale x 2 x double>, ptr [[BASE:%.*]], i64 [[VNUM:%.*]]
+// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call <vscale x 2 x double> @llvm.vector.extract.nxv2f64.nxv6f64(<vscale x 6 x double> [[ZT:%.*]], i64 0)
+// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call <vscale x 2 x double> @llvm.vector.extract.nxv2f64.nxv6f64(<vscale x 6 x double> [[ZT]], i64 2)
+// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call <vscale x 2 x double> @llvm.vector.extract.nxv2f64.nxv6f64(<vscale x 6 x double> [[ZT]], i64 4)
+// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sve.st3q.nxv2f64(<vscale x 2 x double> [[TMP2]], <vscale x 2 x double> [[TMP3]], <vscale x 2 x double> [[TMP4]], <vscale x 2 x i1> [[TMP0]], ptr [[TMP1]])
+// CPP-CHECK-NEXT: ret void
+//
+void test_svst3q_vnum_f64(svbool_t pg, const float64_t *base, int64_t vnum, svfloat64x3_t zt)
+{
+ SVE_ACLE_FUNC(svst3q_vnum,,_f64,)(pg, base, vnum, zt);
+}
+
+//
+// ST4Q
+// CHECK-LABEL: @test_svst4q_u8(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv64i8(<vscale x 64 x i8> [[ZT:%.*]], i64 0)
+// CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv64i8(<vscale x 64 x i8> [[ZT]], i64 16)
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv64i8(<vscale x 64 x i8> [[ZT]], i64 32)
+// CHECK-NEXT: [[TMP3:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv64i8(<vscale x 64 x i8> [[ZT]], i64 48)
+// CHECK-NEXT: tail call void @llvm.aarch64.sve.st4q.nxv16i8(<vscale x 16 x i8> [[TMP0]], <vscale x 16 x i8> [[TMP1]], <vscale x 16 x i8> [[TMP2]], <vscale x 16 x i8> [[TMP3]], <vscale x 16 x i1> [[PG:%.*]], ptr [[BASE:%.*]])
+// CHECK-NEXT: ret void
+//
+// CPP-CHECK-LABEL: @_Z14test_svst4q_u8u10__SVBool_tPKh11svuint8x4_t(
+// CPP-CHECK-NEXT: entry:
+// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv64i8(<vscale x 64 x i8> [[ZT:%.*]], i64 0)
+// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv64i8(<vscale x 64 x i8> [[ZT]], i64 16)
+// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv64i8(<vscale x 64 x i8> [[ZT]], i64 32)
+// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv64i8(<vscale x 64 x i8> [[ZT]], i64 48)
+// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sve.st4q.nxv16i8(<vscale x 16 x i8> [[TMP0]], <vscale x 16 x i8> [[TMP1]], <vscale x 16 x i8> [[TMP2]], <vscale x 16 x i8> [[TMP3]], <vscale x 16 x i1> [[PG:%.*]], ptr [[BASE:%.*]])
+// CPP-CHECK-NEXT: ret void
+//
+void test_svst4q_u8(svbool_t pg, const uint8_t *base, svuint8x4_t zt)
+{
+ SVE_ACLE_FUNC(svst4q,,_u8,)(pg, base, zt);
+}
+
+// CHECK-LABEL: @test_svst4q_s8(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv64i8(<vscale x 64 x i8> [[ZT:%.*]], i64 0)
+// CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv64i8(<vscale x 64 x i8> [[ZT]], i64 16)
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv64i8(<vscale x 64 x i8> [[ZT]], i64 32)
+// CHECK-NEXT: [[TMP3:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv64i8(<vscale x 64 x i8> [[ZT]], i64 48)
+// CHECK-NEXT: tail call void @llvm.aarch64.sve.st4q.nxv16i8(<vscale x 16 x i8> [[TMP0]], <vscale x 16 x i8> [[TMP1]], <vscale x 16 x i8> [[TMP2]], <vscale x 16 x i8> [[TMP3]], <vscale x 16 x i1> [[PG:%.*]], ptr [[BASE:%.*]])
+// CHECK-NEXT: ret void
+//
+// CPP-CHECK-LABEL: @_Z14test_svst4q_s8u10__SVBool_tPKa10svint8x4_t(
+// CPP-CHECK-NEXT: entry:
+// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv64i8(<vscale x 64 x i8> [[ZT:%.*]], i64 0)
+// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv64i8(<vscale x 64 x i8> [[ZT]], i64 16)
+// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv64i8(<vscale x 64 x i8> [[ZT]], i64 32)
+// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv64i8(<vscale x 64 x i8> [[ZT]], i64 48)
+// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sve.st4q.nxv16i8(<vscale x 16 x i8> [[TMP0]], <vscale x 16 x i8> [[TMP1]], <vscale x 16 x i8> [[TMP2]], <vscale x 16 x i8> [[TMP3]], <vscale x 16 x i1> [[PG:%.*]], ptr [[BASE:%.*]])
+// CPP-CHECK-NEXT: ret void
+//
+void test_svst4q_s8(svbool_t pg, const int8_t *base, svint8x4_t zt)
+{
+ SVE_ACLE_FUNC(svst4q,,_s8,)(pg, base, zt);
+}
+// CHECK-LABEL: @test_svst4q_u16(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 8 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv8i1(<vscale x 16 x i1> [[PG:%.*]])
+// CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv32i16(<vscale x 32 x i16> [[ZT:%.*]], i64 0)
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv32i16(<vscale x 32 x i16> [[ZT]], i64 8)
+// CHECK-NEXT: [[TMP3:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv32i16(<vscale x 32 x i16> [[ZT]], i64 16)
+// CHECK-NEXT: [[TMP4:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv32i16(<vscale x 32 x i16> [[ZT]], i64 24)
+// CHECK-NEXT: tail call void @llvm.aarch64.sve.st4q.nxv8i16(<vscale x 8 x i16> [[TMP1]], <vscale x 8 x i16> [[TMP2]], <vscale x 8 x i16> [[TMP3]], <vscale x 8 x i16> [[TMP4]], <vscale x 8 x i1> [[TMP0]], ptr [[BASE:%.*]])
+// CHECK-NEXT: ret void
+//
+// CPP-CHECK-LABEL: @_Z15test_svst4q_u16u10__SVBool_tPKt12svuint16x4_t(
+// CPP-CHECK-NEXT: entry:
+// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 8 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv8i1(<vscale x 16 x i1> [[PG:%.*]])
+// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv32i16(<vscale x 32 x i16> [[ZT:%.*]], i64 0)
+// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv32i16(<vscale x 32 x i16> [[ZT]], i64 8)
+// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv32i16(<vscale x 32 x i16> [[ZT]], i64 16)
+// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv32i16(<vscale x 32 x i16> [[ZT]], i64 24)
+// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sve.st4q.nxv8i16(<vscale x 8 x i16> [[TMP1]], <vscale x 8 x i16> [[TMP2]], <vscale x 8 x i16> [[TMP3]], <vscale x 8 x i16> [[TMP4]], <vscale x 8 x i1> [[TMP0]], ptr [[BASE:%.*]])
+// CPP-CHECK-NEXT: ret void
+//
+void test_svst4q_u16(svbool_t pg, const uint16_t *base, svuint16x4_t zt)
+{
+ SVE_ACLE_FUNC(svst4q,,_u16,)(pg, base, zt);
+}
+
+// CHECK-LABEL: @test_svst4q_s16(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 8 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv8i1(<vscale x 16 x i1> [[PG:%.*]])
+// CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv32i16(<vscale x 32 x i16> [[ZT:%.*]], i64 0)
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv32i16(<vscale x 32 x i16> [[ZT]], i64 8)
+// CHECK-NEXT: [[TMP3:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv32i16(<vscale x 32 x i16> [[ZT]], i64 16)
+// CHECK-NEXT: [[TMP4:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv32i16(<vscale x 32 x i16> [[ZT]], i64 24)
+// CHECK-NEXT: tail call void @llvm.aarch64.sve.st4q.nxv8i16(<vscale x 8 x i16> [[TMP1]], <vscale x 8 x i16> [[TMP2]], <vscale x 8 x i16> [[TMP3]], <vscale x 8 x i16> [[TMP4]], <vscale x 8 x i1> [[TMP0]], ptr [[BASE:%.*]])
+// CHECK-NEXT: ret void
+//
+// CPP-CHECK-LABEL: @_Z15test_svst4q_s16u10__SVBool_tPKs11svint16x4_t(
+// CPP-CHECK-NEXT: entry:
+// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 8 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv8i1(<vscale x 16 x i1> [[PG:%.*]])
+// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv32i16(<vscale x 32 x i16> [[ZT:%.*]], i64 0)
+// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv32i16(<vscale x 32 x i16> [[ZT]], i64 8)
+// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv32i16(<vscale x 32 x i16> [[ZT]], i64 16)
+// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv32i16(<vscale x 32 x i16> [[ZT]], i64 24)
+// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sve.st4q.nxv8i16(<vscale x 8 x i16> [[TMP1]], <vscale x 8 x i16> [[TMP2]], <vscale x 8 x i16> [[TMP3]], <vscale x 8 x i16> [[TMP4]], <vscale x 8 x i1> [[TMP0]], ptr [[BASE:%.*]])
+// CPP-CHECK-NEXT: ret void
+//
+void test_svst4q_s16(svbool_t pg, const int16_t *base, svint16x4_t zt)
+{
+ SVE_ACLE_FUNC(svst4q,,_s16,)(pg, base, zt);
+}
+
+// CHECK-LABEL: @test_svst4q_u32(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 4 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv4i1(<vscale x 16 x i1> [[PG:%.*]])
+// CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 4 x i32> @llvm.vector.extract.nxv4i32.nxv16i32(<vscale x 16 x i32> [[ZT:%.*]], i64 0)
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <vscale x 4 x i32> @llvm.vector.extract.nxv4i32.nxv16i32(<vscale x 16 x i32> [[ZT]], i64 4)
+// CHECK-NEXT: [[TMP3:%.*]] = tail call <vscale x 4 x i32> @llvm.vector.extract.nxv4i32.nxv16i32(<vscale x 16 x i32> [[ZT]], i64 8)
+// CHECK-NEXT: [[TMP4:%.*]] = tail call <vscale x 4 x i32> @llvm.vector.extract.nxv4i32.nxv16i32(<vscale x 16 x i32> [[ZT]], i64 12)
+// CHECK-NEXT: tail call void @llvm.aarch64.sve.st4q.nxv4i32(<vscale x 4 x i32> [[TMP1]], <vscale x 4 x i32> [[TMP2]], <vscale x 4 x i32> [[TMP3]], <vscale x 4 x i32> [[TMP4]], <vscale x 4 x i1> [[TMP0]], ptr [[BASE:%.*]])
+// CHECK-NEXT: ret void
+//
+// CPP-CHECK-LABEL: @_Z15test_svst4q_u32u10__SVBool_tPKj12svuint32x4_t(
+// CPP-CHECK-NEXT: entry:
+// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 4 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv4i1(<vscale x 16 x i1> [[PG:%.*]])
+// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 4 x i32> @llvm.vector.extract.nxv4i32.nxv16i32(<vscale x 16 x i32> [[ZT:%.*]], i64 0)
+// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call <vscale x 4 x i32> @llvm.vector.extract.nxv4i32.nxv16i32(<vscale x 16 x i32> [[ZT]], i64 4)
+// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call <vscale x 4 x i32> @llvm.vector.extract.nxv4i32.nxv16i32(<vscale x 16 x i32> [[ZT]], i64 8)
+// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call <vscale x 4 x i32> @llvm.vector.extract.nxv4i32.nxv16i32(<vscale x 16 x i32> [[ZT]], i64 12)
+// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sve.st4q.nxv4i32(<vscale x 4 x i32> [[TMP1]], <vscale x 4 x i32> [[TMP2]], <vscale x 4 x i32> [[TMP3]], <vscale x 4 x i32> [[TMP4]], <vscale x 4 x i1> [[TMP0]], ptr [[BASE:%.*]])
+// CPP-CHECK-NEXT: ret void
+//
+void test_svst4q_u32(svbool_t pg, const uint32_t *base, svuint32x4_t zt)
+{
+ SVE_ACLE_FUNC(svst4q,,_u32,)(pg, base, zt);
+}
+
+// CHECK-LABEL: @test_svst4q_s32(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 4 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv4i1(<vscale x 16 x i1> [[PG:%.*]])
+// CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 4 x i32> @llvm.vector.extract.nxv4i32.nxv16i32(<vscale x 16 x i32> [[ZT:%.*]], i64 0)
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <vscale x 4 x i32> @llvm.vector.extract.nxv4i32.nxv16i32(<vscale x 16 x i32> [[ZT]], i64 4)
+// CHECK-NEXT: [[TMP3:%.*]] = tail call <vscale x 4 x i32> @llvm.vector.extract.nxv4i32.nxv16i32(<vscale x 16 x i32> [[ZT]], i64 8)
+// CHECK-NEXT: [[TMP4:%.*]] = tail call <vscale x 4 x i32> @llvm.vector.extract.nxv4i32.nxv16i32(<vscale x 16 x i32> [[ZT]], i64 12)
+// CHECK-NEXT: tail call void @llvm.aarch64.sve.st4q.nxv4i32(<vscale x 4 x i32> [[TMP1]], <vscale x 4 x i32> [[TMP2]], <vscale x 4 x i32> [[TMP3]], <vscale x 4 x i32> [[TMP4]], <vscale x 4 x i1> [[TMP0]], ptr [[BASE:%.*]])
+// CHECK-NEXT: ret void
+//
+// CPP-CHECK-LABEL: @_Z15test_svst4q_s32u10__SVBool_tPKi11svint32x4_t(
+// CPP-CHECK-NEXT: entry:
+// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 4 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv4i1(<vscale x 16 x i1> [[PG:%.*]])
+// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 4 x i32> @llvm.vector.extract.nxv4i32.nxv16i32(<vscale x 16 x i32> [[ZT:%.*]], i64 0)
+// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call <vscale x 4 x i32> @llvm.vector.extract.nxv4i32.nxv16i32(<vscale x 16 x i32> [[ZT]], i64 4)
+// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call <vscale x 4 x i32> @llvm.vector.extract.nxv4i32.nxv16i32(<vscale x 16 x i32> [[ZT]], i64 8)
+// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call <vscale x 4 x i32> @llvm.vector.extract.nxv4i32.nxv16i32(<vscale x 16 x i32> [[ZT]], i64 12)
+// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sve.st4q.nxv4i32(<vscale x 4 x i32> [[TMP1]], <vscale x 4 x i32> [[TMP2]], <vscale x 4 x i32> [[TMP3]], <vscale x 4 x i32> [[TMP4]], <vscale x 4 x i1> [[TMP0]], ptr [[BASE:%.*]])
+// CPP-CHECK-NEXT: ret void
+//
+void test_svst4q_s32(svbool_t pg, const int32_t *base, svint32x4_t zt)
+{
+ SVE_ACLE_FUNC(svst4q,,_s32,)(pg, base, zt);
+}
+
+// CHECK-LABEL: @test_svst4q_u64(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 2 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv2i1(<vscale x 16 x i1> [[PG:%.*]])
+// CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 2 x i64> @llvm.vector.extract.nxv2i64.nxv8i64(<vscale x 8 x i64> [[ZT:%.*]], i64 0)
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <vscale x 2 x i64> @llvm.vector.extract.nxv2i64.nxv8i64(<vscale x 8 x i64> [[ZT]], i64 2)
+// CHECK-NEXT: [[TMP3:%.*]] = tail call <vscale x 2 x i64> @llvm.vector.extract.nxv2i64.nxv8i64(<vscale x 8 x i64> [[ZT]], i64 4)
+// CHECK-NEXT: [[TMP4:%.*]] = tail call <vscale x 2 x i64> @llvm.vector.extract.nxv2i64.nxv8i64(<vscale x 8 x i64> [[ZT]], i64 6)
+// CHECK-NEXT: tail call void @llvm.aarch64.sve.st4q.nxv2i64(<vscale x 2 x i64> [[TMP1]], <vscale x 2 x i64> [[TMP2]], <vscale x 2 x i64> [[TMP3]], <vscale x 2 x i64> [[TMP4]], <vscale x 2 x i1> [[TMP0]], ptr [[BASE:%.*]])
+// CHECK-NEXT: ret void
+//
+// CPP-CHECK-LABEL: @_Z15test_svst4q_u64u10__SVBool_tPKm12svuint64x4_t(
+// CPP-CHECK-NEXT: entry:
+// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 2 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv2i1(<vscale x 16 x i1> [[PG:%.*]])
+// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 2 x i64> @llvm.vector.extract.nxv2i64.nxv8i64(<vscale x 8 x i64> [[ZT:%.*]], i64 0)
+// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call <vscale x 2 x i64> @llvm.vector.extract.nxv2i64.nxv8i64(<vscale x 8 x i64> [[ZT]], i64 2)
+// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call <vscale x 2 x i64> @llvm.vector.extract.nxv2i64.nxv8i64(<vscale x 8 x i64> [[ZT]], i64 4)
+// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call <vscale x 2 x i64> @llvm.vector.extract.nxv2i64.nxv8i64(<vscale x 8 x i64> [[ZT]], i64 6)
+// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sve.st4q.nxv2i64(<vscale x 2 x i64> [[TMP1]], <vscale x 2 x i64> [[TMP2]], <vscale x 2 x i64> [[TMP3]], <vscale x 2 x i64> [[TMP4]], <vscale x 2 x i1> [[TMP0]], ptr [[BASE:%.*]])
+// CPP-CHECK-NEXT: ret void
+//
+void test_svst4q_u64(svbool_t pg, const uint64_t *base, svuint64x4_t zt)
+{
+ SVE_ACLE_FUNC(svst4q,,_u64,)(pg, base, zt);
+}
+
+// CHECK-LABEL: @test_svst4q_s64(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 2 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv2i1(<vscale x 16 x i1> [[PG:%.*]])
+// CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 2 x i64> @llvm.vector.extract.nxv2i64.nxv8i64(<vscale x 8 x i64> [[ZT:%.*]], i64 0)
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <vscale x 2 x i64> @llvm.vector.extract.nxv2i64.nxv8i64(<vscale x 8 x i64> [[ZT]], i64 2)
+// CHECK-NEXT: [[TMP3:%.*]] = tail call <vscale x 2 x i64> @llvm.vector.extract.nxv2i64.nxv8i64(<vscale x 8 x i64> [[ZT]], i64 4)
+// CHECK-NEXT: [[TMP4:%.*]] = tail call <vscale x 2 x i64> @llvm.vector.extract.nxv2i64.nxv8i64(<vscale x 8 x i64> [[ZT]], i64 6)
+// CHECK-NEXT: tail call void @llvm.aarch64.sve.st4q.nxv2i64(<vscale x 2 x i64> [[TMP1]], <vscale x 2 x i64> [[TMP2]], <vscale x 2 x i64> [[TMP3]], <vscale x 2 x i64> [[TMP4]], <vscale x 2 x i1> [[TMP0]], ptr [[BASE:%.*]])
+// CHECK-NEXT: ret void
+//
+// CPP-CHECK-LABEL: @_Z15test_svst4q_s64u10__SVBool_tPKl11svint64x4_t(
+// CPP-CHECK-NEXT: entry:
+// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 2 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv2i1(<vscale x 16 x i1> [[PG:%.*]])
+// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 2 x i64> @llvm.vector.extract.nxv2i64.nxv8i64(<vscale x 8 x i64> [[ZT:%.*]], i64 0)
+// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call <vscale x 2 x i64> @llvm.vector.extract.nxv2i64.nxv8i64(<vscale x 8 x i64> [[ZT]], i64 2)
+// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call <vscale x 2 x i64> @llvm.vector.extract.nxv2i64.nxv8i64(<vscale x 8 x i64> [[ZT]], i64 4)
+// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call <vscale x 2 x i64> @llvm.vector.extract.nxv2i64.nxv8i64(<vscale x 8 x i64> [[ZT]], i64 6)
+// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sve.st4q.nxv2i64(<vscale x 2 x i64> [[TMP1]], <vscale x 2 x i64> [[TMP2]], <vscale x 2 x i64> [[TMP3]], <vscale x 2 x i64> [[TMP4]], <vscale x 2 x i1> [[TMP0]], ptr [[BASE:%.*]])
+// CPP-CHECK-NEXT: ret void
+//
+void test_svst4q_s64(svbool_t pg, const int64_t *base, svint64x4_t zt)
+{
+ SVE_ACLE_FUNC(svst4q,,_s64,)(pg, base, zt);
+}
+
+// CHECK-LABEL: @test_svst4q_f16(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 8 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv8i1(<vscale x 16 x i1> [[PG:%.*]])
+// CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 8 x half> @llvm.vector.extract.nxv8f16.nxv32f16(<vscale x 32 x half> [[ZT:%.*]], i64 0)
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <vscale x 8 x half> @llvm.vector.extract.nxv8f16.nxv32f16(<vscale x 32 x half> [[ZT]], i64 8)
+// CHECK-NEXT: [[TMP3:%.*]] = tail call <vscale x 8 x half> @llvm.vector.extract.nxv8f16.nxv32f16(<vscale x 32 x half> [[ZT]], i64 16)
+// CHECK-NEXT: [[TMP4:%.*]] = tail call <vscale x 8 x half> @llvm.vector.extract.nxv8f16.nxv32f16(<vscale x 32 x half> [[ZT]], i64 24)
+// CHECK-NEXT: tail call void @llvm.aarch64.sve.st4q.nxv8f16(<vscale x 8 x half> [[TMP1]], <vscale x 8 x half> [[TMP2]], <vscale x 8 x half> [[TMP3]], <vscale x 8 x half> [[TMP4]], <vscale x 8 x i1> [[TMP0]], ptr [[BASE:%.*]])
+// CHECK-NEXT: ret void
+//
+// CPP-CHECK-LABEL: @_Z15test_svst4q_f16u10__SVBool_tPKDh13svfloat16x4_t(
+// CPP-CHECK-NEXT: entry:
+// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 8 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv8i1(<vscale x 16 x i1> [[PG:%.*]])
+// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 8 x half> @llvm.vector.extract.nxv8f16.nxv32f16(<vscale x 32 x half> [[ZT:%.*]], i64 0)
+// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call <vscale x 8 x half> @llvm.vector.extract.nxv8f16.nxv32f16(<vscale x 32 x half> [[ZT]], i64 8)
+// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call <vscale x 8 x half> @llvm.vector.extract.nxv8f16.nxv32f16(<vscale x 32 x half> [[ZT]], i64 16)
+// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call <vscale x 8 x half> @llvm.vector.extract.nxv8f16.nxv32f16(<vscale x 32 x half> [[ZT]], i64 24)
+// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sve.st4q.nxv8f16(<vscale x 8 x half> [[TMP1]], <vscale x 8 x half> [[TMP2]], <vscale x 8 x half> [[TMP3]], <vscale x 8 x half> [[TMP4]], <vscale x 8 x i1> [[TMP0]], ptr [[BASE:%.*]])
+// CPP-CHECK-NEXT: ret void
+//
+void test_svst4q_f16(svbool_t pg, const float16_t *base, svfloat16x4_t zt)
+{
+ SVE_ACLE_FUNC(svst4q,,_f16,)(pg, base, zt);
+}
+
+// CHECK-LABEL: @test_svst4q_bf16(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 8 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv8i1(<vscale x 16 x i1> [[PG:%.*]])
+// CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 8 x bfloat> @llvm.vector.extract.nxv8bf16.nxv32bf16(<vscale x 32 x bfloat> [[ZT:%.*]], i64 0)
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <vscale x 8 x bfloat> @llvm.vector.extract.nxv8bf16.nxv32bf16(<vscale x 32 x bfloat> [[ZT]], i64 8)
+// CHECK-NEXT: [[TMP3:%.*]] = tail call <vscale x 8 x bfloat> @llvm.vector.extract.nxv8bf16.nxv32bf16(<vscale x 32 x bfloat> [[ZT]], i64 16)
+// CHECK-NEXT: [[TMP4:%.*]] = tail call <vscale x 8 x bfloat> @llvm.vector.extract.nxv8bf16.nxv32bf16(<vscale x 32 x bfloat> [[ZT]], i64 24)
+// CHECK-NEXT: tail call void @llvm.aarch64.sve.st4q.nxv8bf16(<vscale x 8 x bfloat> [[TMP1]], <vscale x 8 x bfloat> [[TMP2]], <vscale x 8 x bfloat> [[TMP3]], <vscale x 8 x bfloat> [[TMP4]], <vscale x 8 x i1> [[TMP0]], ptr [[BASE:%.*]])
+// CHECK-NEXT: ret void
+//
+// CPP-CHECK-LABEL: @_Z16test_svst4q_bf16u10__SVBool_tPKu6__bf1614svbfloat16x4_t(
+// CPP-CHECK-NEXT: entry:
+// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 8 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv8i1(<vscale x 16 x i1> [[PG:%.*]])
+// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 8 x bfloat> @llvm.vector.extract.nxv8bf16.nxv32bf16(<vscale x 32 x bfloat> [[ZT:%.*]], i64 0)
+// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call <vscale x 8 x bfloat> @llvm.vector.extract.nxv8bf16.nxv32bf16(<vscale x 32 x bfloat> [[ZT]], i64 8)
+// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call <vscale x 8 x bfloat> @llvm.vector.extract.nxv8bf16.nxv32bf16(<vscale x 32 x bfloat> [[ZT]], i64 16)
+// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call <vscale x 8 x bfloat> @llvm.vector.extract.nxv8bf16.nxv32bf16(<vscale x 32 x bfloat> [[ZT]], i64 24)
+// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sve.st4q.nxv8bf16(<vscale x 8 x bfloat> [[TMP1]], <vscale x 8 x bfloat> [[TMP2]], <vscale x 8 x bfloat> [[TMP3]], <vscale x 8 x bfloat> [[TMP4]], <vscale x 8 x i1> [[TMP0]], ptr [[BASE:%.*]])
+// CPP-CHECK-NEXT: ret void
+//
+void test_svst4q_bf16(svbool_t pg, const bfloat16_t *base, svbfloat16x4_t zt)
+{
+ SVE_ACLE_FUNC(svst4q,,_bf16,)(pg, base, zt);
+}
+
+// CHECK-LABEL: @test_svst4q_f32(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 4 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv4i1(<vscale x 16 x i1> [[PG:%.*]])
+// CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 4 x float> @llvm.vector.extract.nxv4f32.nxv16f32(<vscale x 16 x float> [[ZT:%.*]], i64 0)
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <vscale x 4 x float> @llvm.vector.extract.nxv4f32.nxv16f32(<vscale x 16 x float> [[ZT]], i64 4)
+// CHECK-NEXT: [[TMP3:%.*]] = tail call <vscale x 4 x float> @llvm.vector.extract.nxv4f32.nxv16f32(<vscale x 16 x float> [[ZT]], i64 8)
+// CHECK-NEXT: [[TMP4:%.*]] = tail call <vscale x 4 x float> @llvm.vector.extract.nxv4f32.nxv16f32(<vscale x 16 x float> [[ZT]], i64 12)
+// CHECK-NEXT: tail call void @llvm.aarch64.sve.st4q.nxv4f32(<vscale x 4 x float> [[TMP1]], <vscale x 4 x float> [[TMP2]], <vscale x 4 x float> [[TMP3]], <vscale x 4 x float> [[TMP4]], <vscale x 4 x i1> [[TMP0]], ptr [[BASE:%.*]])
+// CHECK-NEXT: ret void
+//
+// CPP-CHECK-LABEL: @_Z15test_svst4q_f32u10__SVBool_tPKf13svfloat32x4_t(
+// CPP-CHECK-NEXT: entry:
+// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 4 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv4i1(<vscale x 16 x i1> [[PG:%.*]])
+// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 4 x float> @llvm.vector.extract.nxv4f32.nxv16f32(<vscale x 16 x float> [[ZT:%.*]], i64 0)
+// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call <vscale x 4 x float> @llvm.vector.extract.nxv4f32.nxv16f32(<vscale x 16 x float> [[ZT]], i64 4)
+// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call <vscale x 4 x float> @llvm.vector.extract.nxv4f32.nxv16f32(<vscale x 16 x float> [[ZT]], i64 8)
+// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call <vscale x 4 x float> @llvm.vector.extract.nxv4f32.nxv16f32(<vscale x 16 x float> [[ZT]], i64 12)
+// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sve.st4q.nxv4f32(<vscale x 4 x float> [[TMP1]], <vscale x 4 x float> [[TMP2]], <vscale x 4 x float> [[TMP3]], <vscale x 4 x float> [[TMP4]], <vscale x 4 x i1> [[TMP0]], ptr [[BASE:%.*]])
+// CPP-CHECK-NEXT: ret void
+//
+void test_svst4q_f32(svbool_t pg, const float32_t *base, svfloat32x4_t zt)
+{
+ SVE_ACLE_FUNC(svst4q,,_f32,)(pg, base, zt);
+}
+
+// CHECK-LABEL: @test_svst4q_f64(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 2 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv2i1(<vscale x 16 x i1> [[PG:%.*]])
+// CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 2 x double> @llvm.vector.extract.nxv2f64.nxv8f64(<vscale x 8 x double> [[ZT:%.*]], i64 0)
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <vscale x 2 x double> @llvm.vector.extract.nxv2f64.nxv8f64(<vscale x 8 x double> [[ZT]], i64 2)
+// CHECK-NEXT: [[TMP3:%.*]] = tail call <vscale x 2 x double> @llvm.vector.extract.nxv2f64.nxv8f64(<vscale x 8 x double> [[ZT]], i64 4)
+// CHECK-NEXT: [[TMP4:%.*]] = tail call <vscale x 2 x double> @llvm.vector.extract.nxv2f64.nxv8f64(<vscale x 8 x double> [[ZT]], i64 6)
+// CHECK-NEXT: tail call void @llvm.aarch64.sve.st4q.nxv2f64(<vscale x 2 x double> [[TMP1]], <vscale x 2 x double> [[TMP2]], <vscale x 2 x double> [[TMP3]], <vscale x 2 x double> [[TMP4]], <vscale x 2 x i1> [[TMP0]], ptr [[BASE:%.*]])
+// CHECK-NEXT: ret void
+//
+// CPP-CHECK-LABEL: @_Z15test_svst4q_f64u10__SVBool_tPKd13svfloat64x4_t(
+// CPP-CHECK-NEXT: entry:
+// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 2 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv2i1(<vscale x 16 x i1> [[PG:%.*]])
+// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 2 x double> @llvm.vector.extract.nxv2f64.nxv8f64(<vscale x 8 x double> [[ZT:%.*]], i64 0)
+// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call <vscale x 2 x double> @llvm.vector.extract.nxv2f64.nxv8f64(<vscale x 8 x double> [[ZT]], i64 2)
+// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call <vscale x 2 x double> @llvm.vector.extract.nxv2f64.nxv8f64(<vscale x 8 x double> [[ZT]], i64 4)
+// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call <vscale x 2 x double> @llvm.vector.extract.nxv2f64.nxv8f64(<vscale x 8 x double> [[ZT]], i64 6)
+// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sve.st4q.nxv2f64(<vscale x 2 x double> [[TMP1]], <vscale x 2 x double> [[TMP2]], <vscale x 2 x double> [[TMP3]], <vscale x 2 x double> [[TMP4]], <vscale x 2 x i1> [[TMP0]], ptr [[BASE:%.*]])
+// CPP-CHECK-NEXT: ret void
+//
+void test_svst4q_f64(svbool_t pg, const float64_t *base, svfloat64x4_t zt)
+{
+ SVE_ACLE_FUNC(svst4q,,_f64,)(pg, base, zt);
+}
+
+// CHECK-LABEL: @test_svst4q_vnum_u8(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[TMP0:%.*]] = getelementptr <vscale x 16 x i8>, ptr [[BASE:%.*]], i64 [[VNUM:%.*]]
+// CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv64i8(<vscale x 64 x i8> [[ZT:%.*]], i64 0)
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv64i8(<vscale x 64 x i8> [[ZT]], i64 16)
+// CHECK-NEXT: [[TMP3:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv64i8(<vscale x 64 x i8> [[ZT]], i64 32)
+// CHECK-NEXT: [[TMP4:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv64i8(<vscale x 64 x i8> [[ZT]], i64 48)
+// CHECK-NEXT: tail call void @llvm.aarch64.sve.st4q.nxv16i8(<vscale x 16 x i8> [[TMP1]], <vscale x 16 x i8> [[TMP2]], <vscale x 16 x i8> [[TMP3]], <vscale x 16 x i8> [[TMP4]], <vscale x 16 x i1> [[PG:%.*]], ptr [[TMP0]])
+// CHECK-NEXT: ret void
+//
+// CPP-CHECK-LABEL: @_Z19test_svst4q_vnum_u8u10__SVBool_tPKhl11svuint8x4_t(
+// CPP-CHECK-NEXT: entry:
+// CPP-CHECK-NEXT: [[TMP0:%.*]] = getelementptr <vscale x 16 x i8>, ptr [[BASE:%.*]], i64 [[VNUM:%.*]]
+// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv64i8(<vscale x 64 x i8> [[ZT:%.*]], i64 0)
+// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv64i8(<vscale x 64 x i8> [[ZT]], i64 16)
+// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv64i8(<vscale x 64 x i8> [[ZT]], i64 32)
+// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv64i8(<vscale x 64 x i8> [[ZT]], i64 48)
+// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sve.st4q.nxv16i8(<vscale x 16 x i8> [[TMP1]], <vscale x 16 x i8> [[TMP2]], <vscale x 16 x i8> [[TMP3]], <vscale x 16 x i8> [[TMP4]], <vscale x 16 x i1> [[PG:%.*]], ptr [[TMP0]])
+// CPP-CHECK-NEXT: ret void
+//
+void test_svst4q_vnum_u8(svbool_t pg, const uint8_t *base, int64_t vnum, svuint8x4_t zt)
+{
+ SVE_ACLE_FUNC(svst4q_vnum_,,u8,)(pg, base, vnum, zt);
+}
+
+// CHECK-LABEL: @test_svst4q_vnum_s8(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[TMP0:%.*]] = getelementptr <vscale x 16 x i8>, ptr [[BASE:%.*]], i64 [[VNUM:%.*]]
+// CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv64i8(<vscale x 64 x i8> [[ZT:%.*]], i64 0)
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv64i8(<vscale x 64 x i8> [[ZT]], i64 16)
+// CHECK-NEXT: [[TMP3:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv64i8(<vscale x 64 x i8> [[ZT]], i64 32)
+// CHECK-NEXT: [[TMP4:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv64i8(<vscale x 64 x i8> [[ZT]], i64 48)
+// CHECK-NEXT: tail call void @llvm.aarch64.sve.st4q.nxv16i8(<vscale x 16 x i8> [[TMP1]], <vscale x 16 x i8> [[TMP2]], <vscale x 16 x i8> [[TMP3]], <vscale x 16 x i8> [[TMP4]], <vscale x 16 x i1> [[PG:%.*]], ptr [[TMP0]])
+// CHECK-NEXT: ret void
+//
+// CPP-CHECK-LABEL: @_Z19test_svst4q_vnum_s8u10__SVBool_tPKal10svint8x4_t(
+// CPP-CHECK-NEXT: entry:
+// CPP-CHECK-NEXT: [[TMP0:%.*]] = getelementptr <vscale x 16 x i8>, ptr [[BASE:%.*]], i64 [[VNUM:%.*]]
+// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv64i8(<vscale x 64 x i8> [[ZT:%.*]], i64 0)
+// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv64i8(<vscale x 64 x i8> [[ZT]], i64 16)
+// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv64i8(<vscale x 64 x i8> [[ZT]], i64 32)
+// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv64i8(<vscale x 64 x i8> [[ZT]], i64 48)
+// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sve.st4q.nxv16i8(<vscale x 16 x i8> [[TMP1]], <vscale x 16 x i8> [[TMP2]], <vscale x 16 x i8> [[TMP3]], <vscale x 16 x i8> [[TMP4]], <vscale x 16 x i1> [[PG:%.*]], ptr [[TMP0]])
+// CPP-CHECK-NEXT: ret void
+//
+void test_svst4q_vnum_s8(svbool_t pg, const int8_t *base, int64_t vnum, svint8x4_t zt)
+{
+ SVE_ACLE_FUNC(svst4q_vnum,,_s8,)(pg, base, vnum, zt);
+}
+
+// CHECK-LABEL: @test_svst4q_vnum_u16(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 8 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv8i1(<vscale x 16 x i1> [[PG:%.*]])
+// CHECK-NEXT: [[TMP1:%.*]] = getelementptr <vscale x 8 x i16>, ptr [[BASE:%.*]], i64 [[VNUM:%.*]]
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv32i16(<vscale x 32 x i16> [[ZT:%.*]], i64 0)
+// CHECK-NEXT: [[TMP3:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv32i16(<vscale x 32 x i16> [[ZT]], i64 8)
+// CHECK-NEXT: [[TMP4:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv32i16(<vscale x 32 x i16> [[ZT]], i64 16)
+// CHECK-NEXT: [[TMP5:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv32i16(<vscale x 32 x i16> [[ZT]], i64 24)
+// CHECK-NEXT: tail call void @llvm.aarch64.sve.st4q.nxv8i16(<vscale x 8 x i16> [[TMP2]], <vscale x 8 x i16> [[TMP3]], <vscale x 8 x i16> [[TMP4]], <vscale x 8 x i16> [[TMP5]], <vscale x 8 x i1> [[TMP0]], ptr [[TMP1]])
+// CHECK-NEXT: ret void
+//
+// CPP-CHECK-LABEL: @_Z20test_svst4q_vnum_u16u10__SVBool_tPKtl12svuint16x4_t(
+// CPP-CHECK-NEXT: entry:
+// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 8 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv8i1(<vscale x 16 x i1> [[PG:%.*]])
+// CPP-CHECK-NEXT: [[TMP1:%.*]] = getelementptr <vscale x 8 x i16>, ptr [[BASE:%.*]], i64 [[VNUM:%.*]]
+// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv32i16(<vscale x 32 x i16> [[ZT:%.*]], i64 0)
+// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv32i16(<vscale x 32 x i16> [[ZT]], i64 8)
+// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv32i16(<vscale x 32 x i16> [[ZT]], i64 16)
+// CPP-CHECK-NEXT: [[TMP5:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv32i16(<vscale x 32 x i16> [[ZT]], i64 24)
+// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sve.st4q.nxv8i16(<vscale x 8 x i16> [[TMP2]], <vscale x 8 x i16> [[TMP3]], <vscale x 8 x i16> [[TMP4]], <vscale x 8 x i16> [[TMP5]], <vscale x 8 x i1> [[TMP0]], ptr [[TMP1]])
+// CPP-CHECK-NEXT: ret void
+//
+void test_svst4q_vnum_u16(svbool_t pg, const uint16_t *base, int64_t vnum, svuint16x4_t zt)
+{
+ SVE_ACLE_FUNC(svst4q_vnum,,_u16,)(pg, base, vnum, zt);
+}
+
+// CHECK-LABEL: @test_svst4q_vnum_s16(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 8 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv8i1(<vscale x 16 x i1> [[PG:%.*]])
+// CHECK-NEXT: [[TMP1:%.*]] = getelementptr <vscale x 8 x i16>, ptr [[BASE:%.*]], i64 [[VNUM:%.*]]
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv32i16(<vscale x 32 x i16> [[ZT:%.*]], i64 0)
+// CHECK-NEXT: [[TMP3:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv32i16(<vscale x 32 x i16> [[ZT]], i64 8)
+// CHECK-NEXT: [[TMP4:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv32i16(<vscale x 32 x i16> [[ZT]], i64 16)
+// CHECK-NEXT: [[TMP5:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv32i16(<vscale x 32 x i16> [[ZT]], i64 24)
+// CHECK-NEXT: tail call void @llvm.aarch64.sve.st4q.nxv8i16(<vscale x 8 x i16> [[TMP2]], <vscale x 8 x i16> [[TMP3]], <vscale x 8 x i16> [[TMP4]], <vscale x 8 x i16> [[TMP5]], <vscale x 8 x i1> [[TMP0]], ptr [[TMP1]])
+// CHECK-NEXT: ret void
+//
+// CPP-CHECK-LABEL: @_Z20test_svst4q_vnum_s16u10__SVBool_tPKsl11svint16x4_t(
+// CPP-CHECK-NEXT: entry:
+// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 8 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv8i1(<vscale x 16 x i1> [[PG:%.*]])
+// CPP-CHECK-NEXT: [[TMP1:%.*]] = getelementptr <vscale x 8 x i16>, ptr [[BASE:%.*]], i64 [[VNUM:%.*]]
+// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv32i16(<vscale x 32 x i16> [[ZT:%.*]], i64 0)
+// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv32i16(<vscale x 32 x i16> [[ZT]], i64 8)
+// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv32i16(<vscale x 32 x i16> [[ZT]], i64 16)
+// CPP-CHECK-NEXT: [[TMP5:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv32i16(<vscale x 32 x i16> [[ZT]], i64 24)
+// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sve.st4q.nxv8i16(<vscale x 8 x i16> [[TMP2]], <vscale x 8 x i16> [[TMP3]], <vscale x 8 x i16> [[TMP4]], <vscale x 8 x i16> [[TMP5]], <vscale x 8 x i1> [[TMP0]], ptr [[TMP1]])
+// CPP-CHECK-NEXT: ret void
+//
+void test_svst4q_vnum_s16(svbool_t pg, const int16_t *base, int64_t vnum, svint16x4_t zt)
+{
+ SVE_ACLE_FUNC(svst4q_vnum,,_s16,)(pg, base, vnum, zt);
+}
+
+// CHECK-LABEL: @test_svst4q_vnum_u32(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 4 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv4i1(<vscale x 16 x i1> [[PG:%.*]])
+// CHECK-NEXT: [[TMP1:%.*]] = getelementptr <vscale x 4 x i32>, ptr [[BASE:%.*]], i64 [[VNUM:%.*]]
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <vscale x 4 x i32> @llvm.vector.extract.nxv4i32.nxv16i32(<vscale x 16 x i32> [[ZT:%.*]], i64 0)
+// CHECK-NEXT: [[TMP3:%.*]] = tail call <vscale x 4 x i32> @llvm.vector.extract.nxv4i32.nxv16i32(<vscale x 16 x i32> [[ZT]], i64 4)
+// CHECK-NEXT: [[TMP4:%.*]] = tail call <vscale x 4 x i32> @llvm.vector.extract.nxv4i32.nxv16i32(<vscale x 16 x i32> [[ZT]], i64 8)
+// CHECK-NEXT: [[TMP5:%.*]] = tail call <vscale x 4 x i32> @llvm.vector.extract.nxv4i32.nxv16i32(<vscale x 16 x i32> [[ZT]], i64 12)
+// CHECK-NEXT: tail call void @llvm.aarch64.sve.st4q.nxv4i32(<vscale x 4 x i32> [[TMP2]], <vscale x 4 x i32> [[TMP3]], <vscale x 4 x i32> [[TMP4]], <vscale x 4 x i32> [[TMP5]], <vscale x 4 x i1> [[TMP0]], ptr [[TMP1]])
+// CHECK-NEXT: ret void
+//
+// CPP-CHECK-LABEL: @_Z20test_svst4q_vnum_u32u10__SVBool_tPKjl12svuint32x4_t(
+// CPP-CHECK-NEXT: entry:
+// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 4 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv4i1(<vscale x 16 x i1> [[PG:%.*]])
+// CPP-CHECK-NEXT: [[TMP1:%.*]] = getelementptr <vscale x 4 x i32>, ptr [[BASE:%.*]], i64 [[VNUM:%.*]]
+// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call <vscale x 4 x i32> @llvm.vector.extract.nxv4i32.nxv16i32(<vscale x 16 x i32> [[ZT:%.*]], i64 0)
+// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call <vscale x 4 x i32> @llvm.vector.extract.nxv4i32.nxv16i32(<vscale x 16 x i32> [[ZT]], i64 4)
+// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call <vscale x 4 x i32> @llvm.vector.extract.nxv4i32.nxv16i32(<vscale x 16 x i32> [[ZT]], i64 8)
+// CPP-CHECK-NEXT: [[TMP5:%.*]] = tail call <vscale x 4 x i32> @llvm.vector.extract.nxv4i32.nxv16i32(<vscale x 16 x i32> [[ZT]], i64 12)
+// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sve.st4q.nxv4i32(<vscale x 4 x i32> [[TMP2]], <vscale x 4 x i32> [[TMP3]], <vscale x 4 x i32> [[TMP4]], <vscale x 4 x i32> [[TMP5]], <vscale x 4 x i1> [[TMP0]], ptr [[TMP1]])
+// CPP-CHECK-NEXT: ret void
+//
+void test_svst4q_vnum_u32(svbool_t pg, const uint32_t *base, int64_t vnum, svuint32x4_t zt)
+{
+ SVE_ACLE_FUNC(svst4q_vnum,,_u32,)(pg, base, vnum, zt);
+}
+
+// CHECK-LABEL: @test_svst4q_vnum_s32(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 4 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv4i1(<vscale x 16 x i1> [[PG:%.*]])
+// CHECK-NEXT: [[TMP1:%.*]] = getelementptr <vscale x 4 x i32>, ptr [[BASE:%.*]], i64 [[VNUM:%.*]]
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <vscale x 4 x i32> @llvm.vector.extract.nxv4i32.nxv16i32(<vscale x 16 x i32> [[ZT:%.*]], i64 0)
+// CHECK-NEXT: [[TMP3:%.*]] = tail call <vscale x 4 x i32> @llvm.vector.extract.nxv4i32.nxv16i32(<vscale x 16 x i32> [[ZT]], i64 4)
+// CHECK-NEXT: [[TMP4:%.*]] = tail call <vscale x 4 x i32> @llvm.vector.extract.nxv4i32.nxv16i32(<vscale x 16 x i32> [[ZT]], i64 8)
+// CHECK-NEXT: [[TMP5:%.*]] = tail call <vscale x 4 x i32> @llvm.vector.extract.nxv4i32.nxv16i32(<vscale x 16 x i32> [[ZT]], i64 12)
+// CHECK-NEXT: tail call void @llvm.aarch64.sve.st4q.nxv4i32(<vscale x 4 x i32> [[TMP2]], <vscale x 4 x i32> [[TMP3]], <vscale x 4 x i32> [[TMP4]], <vscale x 4 x i32> [[TMP5]], <vscale x 4 x i1> [[TMP0]], ptr [[TMP1]])
+// CHECK-NEXT: ret void
+//
+// CPP-CHECK-LABEL: @_Z20test_svst4q_vnum_s32u10__SVBool_tPKil11svint32x4_t(
+// CPP-CHECK-NEXT: entry:
+// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 4 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv4i1(<vscale x 16 x i1> [[PG:%.*]])
+// CPP-CHECK-NEXT: [[TMP1:%.*]] = getelementptr <vscale x 4 x i32>, ptr [[BASE:%.*]], i64 [[VNUM:%.*]]
+// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call <vscale x 4 x i32> @llvm.vector.extract.nxv4i32.nxv16i32(<vscale x 16 x i32> [[ZT:%.*]], i64 0)
+// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call <vscale x 4 x i32> @llvm.vector.extract.nxv4i32.nxv16i32(<vscale x 16 x i32> [[ZT]], i64 4)
+// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call <vscale x 4 x i32> @llvm.vector.extract.nxv4i32.nxv16i32(<vscale x 16 x i32> [[ZT]], i64 8)
+// CPP-CHECK-NEXT: [[TMP5:%.*]] = tail call <vscale x 4 x i32> @llvm.vector.extract.nxv4i32.nxv16i32(<vscale x 16 x i32> [[ZT]], i64 12)
+// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sve.st4q.nxv4i32(<vscale x 4 x i32> [[TMP2]], <vscale x 4 x i32> [[TMP3]], <vscale x 4 x i32> [[TMP4]], <vscale x 4 x i32> [[TMP5]], <vscale x 4 x i1> [[TMP0]], ptr [[TMP1]])
+// CPP-CHECK-NEXT: ret void
+//
+void test_svst4q_vnum_s32(svbool_t pg, const int32_t *base, int64_t vnum, svint32x4_t zt)
+{
+ SVE_ACLE_FUNC(svst4q_vnum,,_s32,)(pg, base, vnum, zt);
+}
+
+// CHECK-LABEL: @test_svst4q_vnum_u64(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 2 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv2i1(<vscale x 16 x i1> [[PG:%.*]])
+// CHECK-NEXT: [[TMP1:%.*]] = getelementptr <vscale x 2 x i64>, ptr [[BASE:%.*]], i64 [[VNUM:%.*]]
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <vscale x 2 x i64> @llvm.vector.extract.nxv2i64.nxv8i64(<vscale x 8 x i64> [[ZT:%.*]], i64 0)
+// CHECK-NEXT: [[TMP3:%.*]] = tail call <vscale x 2 x i64> @llvm.vector.extract.nxv2i64.nxv8i64(<vscale x 8 x i64> [[ZT]], i64 2)
+// CHECK-NEXT: [[TMP4:%.*]] = tail call <vscale x 2 x i64> @llvm.vector.extract.nxv2i64.nxv8i64(<vscale x 8 x i64> [[ZT]], i64 4)
+// CHECK-NEXT: [[TMP5:%.*]] = tail call <vscale x 2 x i64> @llvm.vector.extract.nxv2i64.nxv8i64(<vscale x 8 x i64> [[ZT]], i64 6)
+// CHECK-NEXT: tail call void @llvm.aarch64.sve.st4q.nxv2i64(<vscale x 2 x i64> [[TMP2]], <vscale x 2 x i64> [[TMP3]], <vscale x 2 x i64> [[TMP4]], <vscale x 2 x i64> [[TMP5]], <vscale x 2 x i1> [[TMP0]], ptr [[TMP1]])
+// CHECK-NEXT: ret void
+//
+// CPP-CHECK-LABEL: @_Z20test_svst4q_vnum_u64u10__SVBool_tPKml12svuint64x4_t(
+// CPP-CHECK-NEXT: entry:
+// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 2 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv2i1(<vscale x 16 x i1> [[PG:%.*]])
+// CPP-CHECK-NEXT: [[TMP1:%.*]] = getelementptr <vscale x 2 x i64>, ptr [[BASE:%.*]], i64 [[VNUM:%.*]]
+// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call <vscale x 2 x i64> @llvm.vector.extract.nxv2i64.nxv8i64(<vscale x 8 x i64> [[ZT:%.*]], i64 0)
+// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call <vscale x 2 x i64> @llvm.vector.extract.nxv2i64.nxv8i64(<vscale x 8 x i64> [[ZT]], i64 2)
+// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call <vscale x 2 x i64> @llvm.vector.extract.nxv2i64.nxv8i64(<vscale x 8 x i64> [[ZT]], i64 4)
+// CPP-CHECK-NEXT: [[TMP5:%.*]] = tail call <vscale x 2 x i64> @llvm.vector.extract.nxv2i64.nxv8i64(<vscale x 8 x i64> [[ZT]], i64 6)
+// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sve.st4q.nxv2i64(<vscale x 2 x i64> [[TMP2]], <vscale x 2 x i64> [[TMP3]], <vscale x 2 x i64> [[TMP4]], <vscale x 2 x i64> [[TMP5]], <vscale x 2 x i1> [[TMP0]], ptr [[TMP1]])
+// CPP-CHECK-NEXT: ret void
+//
+void test_svst4q_vnum_u64(svbool_t pg, const uint64_t *base, int64_t vnum, svuint64x4_t zt)
+{
+ SVE_ACLE_FUNC(svst4q_vnum,,_u64,)(pg, base, vnum, zt);
+}
+
+// CHECK-LABEL: @test_svst4q_vnum_s64(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 2 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv2i1(<vscale x 16 x i1> [[PG:%.*]])
+// CHECK-NEXT: [[TMP1:%.*]] = getelementptr <vscale x 2 x i64>, ptr [[BASE:%.*]], i64 [[VNUM:%.*]]
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <vscale x 2 x i64> @llvm.vector.extract.nxv2i64.nxv8i64(<vscale x 8 x i64> [[ZT:%.*]], i64 0)
+// CHECK-NEXT: [[TMP3:%.*]] = tail call <vscale x 2 x i64> @llvm.vector.extract.nxv2i64.nxv8i64(<vscale x 8 x i64> [[ZT]], i64 2)
+// CHECK-NEXT: [[TMP4:%.*]] = tail call <vscale x 2 x i64> @llvm.vector.extract.nxv2i64.nxv8i64(<vscale x 8 x i64> [[ZT]], i64 4)
+// CHECK-NEXT: [[TMP5:%.*]] = tail call <vscale x 2 x i64> @llvm.vector.extract.nxv2i64.nxv8i64(<vscale x 8 x i64> [[ZT]], i64 6)
+// CHECK-NEXT: tail call void @llvm.aarch64.sve.st4q.nxv2i64(<vscale x 2 x i64> [[TMP2]], <vscale x 2 x i64> [[TMP3]], <vscale x 2 x i64> [[TMP4]], <vscale x 2 x i64> [[TMP5]], <vscale x 2 x i1> [[TMP0]], ptr [[TMP1]])
+// CHECK-NEXT: ret void
+//
+// CPP-CHECK-LABEL: @_Z20test_svst4q_vnum_s64u10__SVBool_tPKll11svint64x4_t(
+// CPP-CHECK-NEXT: entry:
+// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 2 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv2i1(<vscale x 16 x i1> [[PG:%.*]])
+// CPP-CHECK-NEXT: [[TMP1:%.*]] = getelementptr <vscale x 2 x i64>, ptr [[BASE:%.*]], i64 [[VNUM:%.*]]
+// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call <vscale x 2 x i64> @llvm.vector.extract.nxv2i64.nxv8i64(<vscale x 8 x i64> [[ZT:%.*]], i64 0)
+// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call <vscale x 2 x i64> @llvm.vector.extract.nxv2i64.nxv8i64(<vscale x 8 x i64> [[ZT]], i64 2)
+// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call <vscale x 2 x i64> @llvm.vector.extract.nxv2i64.nxv8i64(<vscale x 8 x i64> [[ZT]], i64 4)
+// CPP-CHECK-NEXT: [[TMP5:%.*]] = tail call <vscale x 2 x i64> @llvm.vector.extract.nxv2i64.nxv8i64(<vscale x 8 x i64> [[ZT]], i64 6)
+// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sve.st4q.nxv2i64(<vscale x 2 x i64> [[TMP2]], <vscale x 2 x i64> [[TMP3]], <vscale x 2 x i64> [[TMP4]], <vscale x 2 x i64> [[TMP5]], <vscale x 2 x i1> [[TMP0]], ptr [[TMP1]])
+// CPP-CHECK-NEXT: ret void
+//
+void test_svst4q_vnum_s64(svbool_t pg, const int64_t *base, int64_t vnum, svint64x4_t zt)
+{
+ SVE_ACLE_FUNC(svst4q_vnum,,_s64,)(pg, base, vnum, zt);
+}
+
+// CHECK-LABEL: @test_svst4q_vnum_f16(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 8 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv8i1(<vscale x 16 x i1> [[PG:%.*]])
+// CHECK-NEXT: [[TMP1:%.*]] = getelementptr <vscale x 8 x half>, ptr [[BASE:%.*]], i64 [[VNUM:%.*]]
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <vscale x 8 x half> @llvm.vector.extract.nxv8f16.nxv32f16(<vscale x 32 x half> [[ZT:%.*]], i64 0)
+// CHECK-NEXT: [[TMP3:%.*]] = tail call <vscale x 8 x half> @llvm.vector.extract.nxv8f16.nxv32f16(<vscale x 32 x half> [[ZT]], i64 8)
+// CHECK-NEXT: [[TMP4:%.*]] = tail call <vscale x 8 x half> @llvm.vector.extract.nxv8f16.nxv32f16(<vscale x 32 x half> [[ZT]], i64 16)
+// CHECK-NEXT: [[TMP5:%.*]] = tail call <vscale x 8 x half> @llvm.vector.extract.nxv8f16.nxv32f16(<vscale x 32 x half> [[ZT]], i64 24)
+// CHECK-NEXT: tail call void @llvm.aarch64.sve.st4q.nxv8f16(<vscale x 8 x half> [[TMP2]], <vscale x 8 x half> [[TMP3]], <vscale x 8 x half> [[TMP4]], <vscale x 8 x half> [[TMP5]], <vscale x 8 x i1> [[TMP0]], ptr [[TMP1]])
+// CHECK-NEXT: ret void
+//
+// CPP-CHECK-LABEL: @_Z20test_svst4q_vnum_f16u10__SVBool_tPKDhl13svfloat16x4_t(
+// CPP-CHECK-NEXT: entry:
+// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 8 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv8i1(<vscale x 16 x i1> [[PG:%.*]])
+// CPP-CHECK-NEXT: [[TMP1:%.*]] = getelementptr <vscale x 8 x half>, ptr [[BASE:%.*]], i64 [[VNUM:%.*]]
+// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call <vscale x 8 x half> @llvm.vector.extract.nxv8f16.nxv32f16(<vscale x 32 x half> [[ZT:%.*]], i64 0)
+// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call <vscale x 8 x half> @llvm.vector.extract.nxv8f16.nxv32f16(<vscale x 32 x half> [[ZT]], i64 8)
+// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call <vscale x 8 x half> @llvm.vector.extract.nxv8f16.nxv32f16(<vscale x 32 x half> [[ZT]], i64 16)
+// CPP-CHECK-NEXT: [[TMP5:%.*]] = tail call <vscale x 8 x half> @llvm.vector.extract.nxv8f16.nxv32f16(<vscale x 32 x half> [[ZT]], i64 24)
+// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sve.st4q.nxv8f16(<vscale x 8 x half> [[TMP2]], <vscale x 8 x half> [[TMP3]], <vscale x 8 x half> [[TMP4]], <vscale x 8 x half> [[TMP5]], <vscale x 8 x i1> [[TMP0]], ptr [[TMP1]])
+// CPP-CHECK-NEXT: ret void
+//
+void test_svst4q_vnum_f16(svbool_t pg, const float16_t *base, int64_t vnum, svfloat16x4_t zt)
+{
+ SVE_ACLE_FUNC(svst4q_vnum,,_f16,)(pg, base, vnum, zt);
+}
+
+// CHECK-LABEL: @test_svst4q_vnum_bf16(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 8 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv8i1(<vscale x 16 x i1> [[PG:%.*]])
+// CHECK-NEXT: [[TMP1:%.*]] = getelementptr <vscale x 8 x bfloat>, ptr [[BASE:%.*]], i64 [[VNUM:%.*]]
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <vscale x 8 x bfloat> @llvm.vector.extract.nxv8bf16.nxv32bf16(<vscale x 32 x bfloat> [[ZT:%.*]], i64 0)
+// CHECK-NEXT: [[TMP3:%.*]] = tail call <vscale x 8 x bfloat> @llvm.vector.extract.nxv8bf16.nxv32bf16(<vscale x 32 x bfloat> [[ZT]], i64 8)
+// CHECK-NEXT: [[TMP4:%.*]] = tail call <vscale x 8 x bfloat> @llvm.vector.extract.nxv8bf16.nxv32bf16(<vscale x 32 x bfloat> [[ZT]], i64 16)
+// CHECK-NEXT: [[TMP5:%.*]] = tail call <vscale x 8 x bfloat> @llvm.vector.extract.nxv8bf16.nxv32bf16(<vscale x 32 x bfloat> [[ZT]], i64 24)
+// CHECK-NEXT: tail call void @llvm.aarch64.sve.st4q.nxv8bf16(<vscale x 8 x bfloat> [[TMP2]], <vscale x 8 x bfloat> [[TMP3]], <vscale x 8 x bfloat> [[TMP4]], <vscale x 8 x bfloat> [[TMP5]], <vscale x 8 x i1> [[TMP0]], ptr [[TMP1]])
+// CHECK-NEXT: ret void
+//
+// CPP-CHECK-LABEL: @_Z21test_svst4q_vnum_bf16u10__SVBool_tPKu6__bf16l14svbfloat16x4_t(
+// CPP-CHECK-NEXT: entry:
+// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 8 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv8i1(<vscale x 16 x i1> [[PG:%.*]])
+// CPP-CHECK-NEXT: [[TMP1:%.*]] = getelementptr <vscale x 8 x bfloat>, ptr [[BASE:%.*]], i64 [[VNUM:%.*]]
+// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call <vscale x 8 x bfloat> @llvm.vector.extract.nxv8bf16.nxv32bf16(<vscale x 32 x bfloat> [[ZT:%.*]], i64 0)
+// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call <vscale x 8 x bfloat> @llvm.vector.extract.nxv8bf16.nxv32bf16(<vscale x 32 x bfloat> [[ZT]], i64 8)
+// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call <vscale x 8 x bfloat> @llvm.vector.extract.nxv8bf16.nxv32bf16(<vscale x 32 x bfloat> [[ZT]], i64 16)
+// CPP-CHECK-NEXT: [[TMP5:%.*]] = tail call <vscale x 8 x bfloat> @llvm.vector.extract.nxv8bf16.nxv32bf16(<vscale x 32 x bfloat> [[ZT]], i64 24)
+// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sve.st4q.nxv8bf16(<vscale x 8 x bfloat> [[TMP2]], <vscale x 8 x bfloat> [[TMP3]], <vscale x 8 x bfloat> [[TMP4]], <vscale x 8 x bfloat> [[TMP5]], <vscale x 8 x i1> [[TMP0]], ptr [[TMP1]])
+// CPP-CHECK-NEXT: ret void
+//
+void test_svst4q_vnum_bf16(svbool_t pg, const bfloat16_t *base, int64_t vnum, svbfloat16x4_t zt)
+{
+ SVE_ACLE_FUNC(svst4q_vnum,,_bf16,)(pg, base, vnum, zt);
+}
+
+// CHECK-LABEL: @test_svst4q_vnum_f32(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 4 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv4i1(<vscale x 16 x i1> [[PG:%.*]])
+// CHECK-NEXT: [[TMP1:%.*]] = getelementptr <vscale x 4 x float>, ptr [[BASE:%.*]], i64 [[VNUM:%.*]]
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <vscale x 4 x float> @llvm.vector.extract.nxv4f32.nxv16f32(<vscale x 16 x float> [[ZT:%.*]], i64 0)
+// CHECK-NEXT: [[TMP3:%.*]] = tail call <vscale x 4 x float> @llvm.vector.extract.nxv4f32.nxv16f32(<vscale x 16 x float> [[ZT]], i64 4)
+// CHECK-NEXT: [[TMP4:%.*]] = tail call <vscale x 4 x float> @llvm.vector.extract.nxv4f32.nxv16f32(<vscale x 16 x float> [[ZT]], i64 8)
+// CHECK-NEXT: [[TMP5:%.*]] = tail call <vscale x 4 x float> @llvm.vector.extract.nxv4f32.nxv16f32(<vscale x 16 x float> [[ZT]], i64 12)
+// CHECK-NEXT: tail call void @llvm.aarch64.sve.st4q.nxv4f32(<vscale x 4 x float> [[TMP2]], <vscale x 4 x float> [[TMP3]], <vscale x 4 x float> [[TMP4]], <vscale x 4 x float> [[TMP5]], <vscale x 4 x i1> [[TMP0]], ptr [[TMP1]])
+// CHECK-NEXT: ret void
+//
+// CPP-CHECK-LABEL: @_Z20test_svst4q_vnum_f32u10__SVBool_tPKfl13svfloat32x4_t(
+// CPP-CHECK-NEXT: entry:
+// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 4 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv4i1(<vscale x 16 x i1> [[PG:%.*]])
+// CPP-CHECK-NEXT: [[TMP1:%.*]] = getelementptr <vscale x 4 x float>, ptr [[BASE:%.*]], i64 [[VNUM:%.*]]
+// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call <vscale x 4 x float> @llvm.vector.extract.nxv4f32.nxv16f32(<vscale x 16 x float> [[ZT:%.*]], i64 0)
+// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call <vscale x 4 x float> @llvm.vector.extract.nxv4f32.nxv16f32(<vscale x 16 x float> [[ZT]], i64 4)
+// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call <vscale x 4 x float> @llvm.vector.extract.nxv4f32.nxv16f32(<vscale x 16 x float> [[ZT]], i64 8)
+// CPP-CHECK-NEXT: [[TMP5:%.*]] = tail call <vscale x 4 x float> @llvm.vector.extract.nxv4f32.nxv16f32(<vscale x 16 x float> [[ZT]], i64 12)
+// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sve.st4q.nxv4f32(<vscale x 4 x float> [[TMP2]], <vscale x 4 x float> [[TMP3]], <vscale x 4 x float> [[TMP4]], <vscale x 4 x float> [[TMP5]], <vscale x 4 x i1> [[TMP0]], ptr [[TMP1]])
+// CPP-CHECK-NEXT: ret void
+//
+void test_svst4q_vnum_f32(svbool_t pg, const float32_t *base, int64_t vnum, svfloat32x4_t zt)
+{
+ SVE_ACLE_FUNC(svst4q_vnum,,_f32,)(pg, base, vnum, zt);
+}
+
+// CHECK-LABEL: @test_svst4q_vnum_f64(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 2 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv2i1(<vscale x 16 x i1> [[PG:%.*]])
+// CHECK-NEXT: [[TMP1:%.*]] = getelementptr <vscale x 2 x double>, ptr [[BASE:%.*]], i64 [[VNUM:%.*]]
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <vscale x 2 x double> @llvm.vector.extract.nxv2f64.nxv8f64(<vscale x 8 x double> [[ZT:%.*]], i64 0)
+// CHECK-NEXT: [[TMP3:%.*]] = tail call <vscale x 2 x double> @llvm.vector.extract.nxv2f64.nxv8f64(<vscale x 8 x double> [[ZT]], i64 2)
+// CHECK-NEXT: [[TMP4:%.*]] = tail call <vscale x 2 x double> @llvm.vector.extract.nxv2f64.nxv8f64(<vscale x 8 x double> [[ZT]], i64 4)
+// CHECK-NEXT: [[TMP5:%.*]] = tail call <vscale x 2 x double> @llvm.vector.extract.nxv2f64.nxv8f64(<vscale x 8 x double> [[ZT]], i64 6)
+// CHECK-NEXT: tail call void @llvm.aarch64.sve.st4q.nxv2f64(<vscale x 2 x double> [[TMP2]], <vscale x 2 x double> [[TMP3]], <vscale x 2 x double> [[TMP4]], <vscale x 2 x double> [[TMP5]], <vscale x 2 x i1> [[TMP0]], ptr [[TMP1]])
+// CHECK-NEXT: ret void
+//
+// CPP-CHECK-LABEL: @_Z20test_svst4q_vnum_f64u10__SVBool_tPKdl13svfloat64x4_t(
+// CPP-CHECK-NEXT: entry:
+// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 2 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv2i1(<vscale x 16 x i1> [[PG:%.*]])
+// CPP-CHECK-NEXT: [[TMP1:%.*]] = getelementptr <vscale x 2 x double>, ptr [[BASE:%.*]], i64 [[VNUM:%.*]]
+// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call <vscale x 2 x double> @llvm.vector.extract.nxv2f64.nxv8f64(<vscale x 8 x double> [[ZT:%.*]], i64 0)
+// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call <vscale x 2 x double> @llvm.vector.extract.nxv2f64.nxv8f64(<vscale x 8 x double> [[ZT]], i64 2)
+// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call <vscale x 2 x double> @llvm.vector.extract.nxv2f64.nxv8f64(<vscale x 8 x double> [[ZT]], i64 4)
+// CPP-CHECK-NEXT: [[TMP5:%.*]] = tail call <vscale x 2 x double> @llvm.vector.extract.nxv2f64.nxv8f64(<vscale x 8 x double> [[ZT]], i64 6)
+// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sve.st4q.nxv2f64(<vscale x 2 x double> [[TMP2]], <vscale x 2 x double> [[TMP3]], <vscale x 2 x double> [[TMP4]], <vscale x 2 x double> [[TMP5]], <vscale x 2 x i1> [[TMP0]], ptr [[TMP1]])
+// CPP-CHECK-NEXT: ret void
+//
+void test_svst4q_vnum_f64(svbool_t pg, const float64_t *base, int64_t vnum, svfloat64x4_t zt)
+{
+ SVE_ACLE_FUNC(svst4q_vnum,,_f64,)(pg, base, vnum, zt);
+}
+
+// Scatter for 128 bits
+// vector base + scalar offset
+// CHECK-LABEL: @test_svst1q_scatter_u64base_offset_u64(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 2 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv2i1(<vscale x 16 x i1> [[PG:%.*]])
+// CHECK-NEXT: tail call void @llvm.aarch64.sve.st1q.scatter.scalar.offset.nxv2i64.nxv2i64(<vscale x 2 x i64> [[DATA:%.*]], <vscale x 2 x i1> [[TMP0]], <vscale x 2 x i64> [[BASE:%.*]], i64 [[OFFSET:%.*]])
+// CHECK-NEXT: ret void
+//
+// CPP-CHECK-LABEL: @_Z38test_svst1q_scatter_u64base_offset_u64u10__SVBool_tu12__SVUint64_tlS0_(
+// CPP-CHECK-NEXT: entry:
+// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 2 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv2i1(<vscale x 16 x i1> [[PG:%.*]])
+// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sve.st1q.scatter.scalar.offset.nxv2i64.nxv2i64(<vscale x 2 x i64> [[DATA:%.*]], <vscale x 2 x i1> [[TMP0]], <vscale x 2 x i64> [[BASE:%.*]], i64 [[OFFSET:%.*]])
+// CPP-CHECK-NEXT: ret void
+//
+void test_svst1q_scatter_u64base_offset_u64(svbool_t pg, svuint64_t base, int64_t offset, svuint64_t data)
+{
+ SVE_ACLE_FUNC(svst1q_scatter, _u64base, _offset, _u64)(pg, base, offset, data);
+}
+
+// CHECK-LABEL: @test_svst1q_scatter_u64base_offset_s64(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 2 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv2i1(<vscale x 16 x i1> [[PG:%.*]])
+// CHECK-NEXT: tail call void @llvm.aarch64.sve.st1q.scatter.scalar.offset.nxv2i64.nxv2i64(<vscale x 2 x i64> [[DATA:%.*]], <vscale x 2 x i1> [[TMP0]], <vscale x 2 x i64> [[BASE:%.*]], i64 [[OFFSET:%.*]])
+// CHECK-NEXT: ret void
+//
+// CPP-CHECK-LABEL: @_Z38test_svst1q_scatter_u64base_offset_s64u10__SVBool_tu12__SVUint64_tlu11__SVInt64_t(
+// CPP-CHECK-NEXT: entry:
+// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 2 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv2i1(<vscale x 16 x i1> [[PG:%.*]])
+// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sve.st1q.scatter.scalar.offset.nxv2i64.nxv2i64(<vscale x 2 x i64> [[DATA:%.*]], <vscale x 2 x i1> [[TMP0]], <vscale x 2 x i64> [[BASE:%.*]], i64 [[OFFSET:%.*]])
+// CPP-CHECK-NEXT: ret void
+//
+void test_svst1q_scatter_u64base_offset_s64(svbool_t pg, svuint64_t base, int64_t offset, svint64_t data)
+{
+ SVE_ACLE_FUNC(svst1q_scatter, _u64base, _offset, _s64)(pg, base, offset, data);
+}
+
+
+// CHECK-LABEL: @test_svst1q_scatter_u64base_offset_u32(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 4 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv4i1(<vscale x 16 x i1> [[PG:%.*]])
+// CHECK-NEXT: tail call void @llvm.aarch64.sve.st1q.scatter.scalar.offset.nxv4i32.nxv2i64(<vscale x 4 x i32> [[DATA:%.*]], <vscale x 4 x i1> [[TMP0]], <vscale x 2 x i64> [[BASE:%.*]], i64 [[OFFSET:%.*]])
+// CHECK-NEXT: ret void
+//
+// CPP-CHECK-LABEL: @_Z38test_svst1q_scatter_u64base_offset_u32u10__SVBool_tu12__SVUint64_tlu12__SVUint32_t(
+// CPP-CHECK-NEXT: entry:
+// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 4 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv4i1(<vscale x 16 x i1> [[PG:%.*]])
+// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sve.st1q.scatter.scalar.offset.nxv4i32.nxv2i64(<vscale x 4 x i32> [[DATA:%.*]], <vscale x 4 x i1> [[TMP0]], <vscale x 2 x i64> [[BASE:%.*]], i64 [[OFFSET:%.*]])
+// CPP-CHECK-NEXT: ret void
+//
+void test_svst1q_scatter_u64base_offset_u32(svbool_t pg, svuint64_t base, int64_t offset, svuint32_t data)
+{
+ SVE_ACLE_FUNC(svst1q_scatter, _u64base, _offset, _u32)(pg, base, offset, data);
+}
+
+// CHECK-LABEL: @test_svst1q_scatter_u64base_offset_s32(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 4 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv4i1(<vscale x 16 x i1> [[PG:%.*]])
+// CHECK-NEXT: tail call void @llvm.aarch64.sve.st1q.scatter.scalar.offset.nxv4i32.nxv2i64(<vscale x 4 x i32> [[DATA:%.*]], <vscale x 4 x i1> [[TMP0]], <vscale x 2 x i64> [[BASE:%.*]], i64 [[OFFSET:%.*]])
+// CHECK-NEXT: ret void
+//
+// CPP-CHECK-LABEL: @_Z38test_svst1q_scatter_u64base_offset_s32u10__SVBool_tu12__SVUint64_tlu11__SVInt32_t(
+// CPP-CHECK-NEXT: entry:
+// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 4 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv4i1(<vscale x 16 x i1> [[PG:%.*]])
+// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sve.st1q.scatter.scalar.offset.nxv4i32.nxv2i64(<vscale x 4 x i32> [[DATA:%.*]], <vscale x 4 x i1> [[TMP0]], <vscale x 2 x i64> [[BASE:%.*]], i64 [[OFFSET:%.*]])
+// CPP-CHECK-NEXT: ret void
+//
+void test_svst1q_scatter_u64base_offset_s32(svbool_t pg, svuint64_t base, int64_t offset, svint32_t data)
+{
+ SVE_ACLE_FUNC(svst1q_scatter, _u64base, _offset, _s32)(pg, base, offset, data);
+}
+
+// CHECK-LABEL: @test_svst1q_scatter_u64base_offset_u16(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 8 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv8i1(<vscale x 16 x i1> [[PG:%.*]])
+// CHECK-NEXT: tail call void @llvm.aarch64.sve.st1q.scatter.scalar.offset.nxv8i16.nxv2i64(<vscale x 8 x i16> [[DATA:%.*]], <vscale x 8 x i1> [[TMP0]], <vscale x 2 x i64> [[BASE:%.*]], i64 [[OFFSET:%.*]])
+// CHECK-NEXT: ret void
+//
+// CPP-CHECK-LABEL: @_Z38test_svst1q_scatter_u64base_offset_u16u10__SVBool_tu12__SVUint64_tlu12__SVUint16_t(
+// CPP-CHECK-NEXT: entry:
+// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 8 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv8i1(<vscale x 16 x i1> [[PG:%.*]])
+// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sve.st1q.scatter.scalar.offset.nxv8i16.nxv2i64(<vscale x 8 x i16> [[DATA:%.*]], <vscale x 8 x i1> [[TMP0]], <vscale x 2 x i64> [[BASE:%.*]], i64 [[OFFSET:%.*]])
+// CPP-CHECK-NEXT: ret void
+//
+void test_svst1q_scatter_u64base_offset_u16(svbool_t pg, svuint64_t base, int64_t offset, svuint16_t data)
+{
+ SVE_ACLE_FUNC(svst1q_scatter, _u64base, _offset, _u16)(pg, base, offset, data);
+}
+
+// CHECK-LABEL: @test_svst1q_scatter_u64base_offset_s16(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 8 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv8i1(<vscale x 16 x i1> [[PG:%.*]])
+// CHECK-NEXT: tail call void @llvm.aarch64.sve.st1q.scatter.scalar.offset.nxv8i16.nxv2i64(<vscale x 8 x i16> [[DATA:%.*]], <vscale x 8 x i1> [[TMP0]], <vscale x 2 x i64> [[BASE:%.*]], i64 [[OFFSET:%.*]])
+// CHECK-NEXT: ret void
+//
+// CPP-CHECK-LABEL: @_Z38test_svst1q_scatter_u64base_offset_s16u10__SVBool_tu12__SVUint64_tlu11__SVInt16_t(
+// CPP-CHECK-NEXT: entry:
+// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 8 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv8i1(<vscale x 16 x i1> [[PG:%.*]])
+// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sve.st1q.scatter.scalar.offset.nxv8i16.nxv2i64(<vscale x 8 x i16> [[DATA:%.*]], <vscale x 8 x i1> [[TMP0]], <vscale x 2 x i64> [[BASE:%.*]], i64 [[OFFSET:%.*]])
+// CPP-CHECK-NEXT: ret void
+//
+void test_svst1q_scatter_u64base_offset_s16(svbool_t pg, svuint64_t base, int64_t offset, svint16_t data)
+{
+ SVE_ACLE_FUNC(svst1q_scatter, _u64base, _offset, _s16)(pg, base, offset, data);
+}
+
+// CHECK-LABEL: @test_svst1q_scatter_u64base_offset_u8(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: tail call void @llvm.aarch64.sve.st1q.scatter.scalar.offset.nxv16i8.nxv2i64(<vscale x 16 x i8> [[DATA:%.*]], <vscale x 16 x i1> [[PG:%.*]], <vscale x 2 x i64> [[BASE:%.*]], i64 [[OFFSET:%.*]])
+// CHECK-NEXT: ret void
+//
+// CPP-CHECK-LABEL: @_Z37test_svst1q_scatter_u64base_offset_u8u10__SVBool_tu12__SVUint64_tlu11__SVUint8_t(
+// CPP-CHECK-NEXT: entry:
+// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sve.st1q.scatter.scalar.offset.nxv16i8.nxv2i64(<vscale x 16 x i8> [[DATA:%.*]], <vscale x 16 x i1> [[PG:%.*]], <vscale x 2 x i64> [[BASE:%.*]], i64 [[OFFSET:%.*]])
+// CPP-CHECK-NEXT: ret void
+//
+void test_svst1q_scatter_u64base_offset_u8(svbool_t pg, svuint64_t base, int64_t offset, svuint8_t data)
+{
+ SVE_ACLE_FUNC(svst1q_scatter, _u64base, _offset, _u8)(pg, base, offset, data);
+}
+
+// CHECK-LABEL: @test_svst1q_scatter_u64base_offset_s8(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: tail call void @llvm.aarch64.sve.st1q.scatter.scalar.offset.nxv16i8.nxv2i64(<vscale x 16 x i8> [[DATA:%.*]], <vscale x 16 x i1> [[PG:%.*]], <vscale x 2 x i64> [[BASE:%.*]], i64 [[OFFSET:%.*]])
+// CHECK-NEXT: ret void
+//
+// CPP-CHECK-LABEL: @_Z37test_svst1q_scatter_u64base_offset_s8u10__SVBool_tu12__SVUint64_tlu10__SVInt8_t(
+// CPP-CHECK-NEXT: entry:
+// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sve.st1q.scatter.scalar.offset.nxv16i8.nxv2i64(<vscale x 16 x i8> [[DATA:%.*]], <vscale x 16 x i1> [[PG:%.*]], <vscale x 2 x i64> [[BASE:%.*]], i64 [[OFFSET:%.*]])
+// CPP-CHECK-NEXT: ret void
+//
+void test_svst1q_scatter_u64base_offset_s8(svbool_t pg, svuint64_t base, int64_t offset, svint8_t data)
+{
+ SVE_ACLE_FUNC(svst1q_scatter, _u64base, _offset, _s8)(pg, base, offset, data);
+}
+
+// CHECK-LABEL: @test_svst1q_scatter_u64base_offset_f64(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 2 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv2i1(<vscale x 16 x i1> [[PG:%.*]])
+// CHECK-NEXT: tail call void @llvm.aarch64.sve.st1q.scatter.scalar.offset.nxv2f64.nxv2i64(<vscale x 2 x double> [[DATA:%.*]], <vscale x 2 x i1> [[TMP0]], <vscale x 2 x i64> [[BASE:%.*]], i64 [[OFFSET:%.*]])
+// CHECK-NEXT: ret void
+//
+// CPP-CHECK-LABEL: @_Z38test_svst1q_scatter_u64base_offset_f64u10__SVBool_tu12__SVUint64_tlu13__SVFloat64_t(
+// CPP-CHECK-NEXT: entry:
+// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 2 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv2i1(<vscale x 16 x i1> [[PG:%.*]])
+// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sve.st1q.scatter.scalar.offset.nxv2f64.nxv2i64(<vscale x 2 x double> [[DATA:%.*]], <vscale x 2 x i1> [[TMP0]], <vscale x 2 x i64> [[BASE:%.*]], i64 [[OFFSET:%.*]])
+// CPP-CHECK-NEXT: ret void
+//
+void test_svst1q_scatter_u64base_offset_f64(svbool_t pg, svuint64_t base, int64_t offset, svfloat64_t data)
+{
+ SVE_ACLE_FUNC(svst1q_scatter, _u64base, _offset, _f64)(pg, base, offset, data);
+}
+
+// CHECK-LABEL: @test_svst1q_scatter_u64base_offset_f32(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 4 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv4i1(<vscale x 16 x i1> [[PG:%.*]])
+// CHECK-NEXT: tail call void @llvm.aarch64.sve.st1q.scatter.scalar.offset.nxv4f32.nxv2i64(<vscale x 4 x float> [[DATA:%.*]], <vscale x 4 x i1> [[TMP0]], <vscale x 2 x i64> [[BASE:%.*]], i64 [[OFFSET:%.*]])
+// CHECK-NEXT: ret void
+//
+// CPP-CHECK-LABEL: @_Z38test_svst1q_scatter_u64base_offset_f32u10__SVBool_tu12__SVUint64_tlu13__SVFloat32_t(
+// CPP-CHECK-NEXT: entry:
+// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 4 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv4i1(<vscale x 16 x i1> [[PG:%.*]])
+// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sve.st1q.scatter.scalar.offset.nxv4f32.nxv2i64(<vscale x 4 x float> [[DATA:%.*]], <vscale x 4 x i1> [[TMP0]], <vscale x 2 x i64> [[BASE:%.*]], i64 [[OFFSET:%.*]])
+// CPP-CHECK-NEXT: ret void
+//
+void test_svst1q_scatter_u64base_offset_f32(svbool_t pg, svuint64_t base, int64_t offset, svfloat32_t data)
+{
+ SVE_ACLE_FUNC(svst1q_scatter, _u64base, _offset, _f32)(pg, base, offset, data);
+}
+
+// CHECK-LABEL: @test_svst1q_scatter_u64base_offset_f16(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 8 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv8i1(<vscale x 16 x i1> [[PG:%.*]])
+// CHECK-NEXT: tail call void @llvm.aarch64.sve.st1q.scatter.scalar.offset.nxv8f16.nxv2i64(<vscale x 8 x half> [[DATA:%.*]], <vscale x 8 x i1> [[TMP0]], <vscale x 2 x i64> [[BASE:%.*]], i64 [[OFFSET:%.*]])
+// CHECK-NEXT: ret void
+//
+// CPP-CHECK-LABEL: @_Z38test_svst1q_scatter_u64base_offset_f16u10__SVBool_tu12__SVUint64_tlu13__SVFloat16_t(
+// CPP-CHECK-NEXT: entry:
+// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 8 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv8i1(<vscale x 16 x i1> [[PG:%.*]])
+// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sve.st1q.scatter.scalar.offset.nxv8f16.nxv2i64(<vscale x 8 x half> [[DATA:%.*]], <vscale x 8 x i1> [[TMP0]], <vscale x 2 x i64> [[BASE:%.*]], i64 [[OFFSET:%.*]])
+// CPP-CHECK-NEXT: ret void
+//
+void test_svst1q_scatter_u64base_offset_f16(svbool_t pg, svuint64_t base, int64_t offset, svfloat16_t data)
+{
+ SVE_ACLE_FUNC(svst1q_scatter, _u64base, _offset, _f16)(pg, base, offset, data);
+}
+
+// CHECK-LABEL: @test_svst1q_scatter_u64base_offset_bf16(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 8 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv8i1(<vscale x 16 x i1> [[PG:%.*]])
+// CHECK-NEXT: tail call void @llvm.aarch64.sve.st1q.scatter.scalar.offset.nxv8bf16.nxv2i64(<vscale x 8 x bfloat> [[DATA:%.*]], <vscale x 8 x i1> [[TMP0]], <vscale x 2 x i64> [[BASE:%.*]], i64 [[OFFSET:%.*]])
+// CHECK-NEXT: ret void
+//
+// CPP-CHECK-LABEL: @_Z39test_svst1q_scatter_u64base_offset_bf16u10__SVBool_tu12__SVUint64_tlu14__SVBfloat16_t(
+// CPP-CHECK-NEXT: entry:
+// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 8 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv8i1(<vscale x 16 x i1> [[PG:%.*]])
+// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sve.st1q.scatter.scalar.offset.nxv8bf16.nxv2i64(<vscale x 8 x bfloat> [[DATA:%.*]], <vscale x 8 x i1> [[TMP0]], <vscale x 2 x i64> [[BASE:%.*]], i64 [[OFFSET:%.*]])
+// CPP-CHECK-NEXT: ret void
+//
+void test_svst1q_scatter_u64base_offset_bf16(svbool_t pg, svuint64_t base, int64_t offset, svbfloat16_t data)
+{
+ SVE_ACLE_FUNC(svst1q_scatter, _u64base, _offset, _bf16)(pg, base, offset, data);
+}
+
+// Vector Base and no Offset
+// CHECK-LABEL: @test_svst1q_scatter_u64base_u64(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 2 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv2i1(<vscale x 16 x i1> [[PG:%.*]])
+// CHECK-NEXT: tail call void @llvm.aarch64.sve.st1q.scatter.scalar.offset.nxv2i64.nxv2i64(<vscale x 2 x i64> [[DATA:%.*]], <vscale x 2 x i1> [[TMP0]], <vscale x 2 x i64> [[BASE:%.*]], i64 0)
+// CHECK-NEXT: ret void
+//
+// CPP-CHECK-LABEL: @_Z31test_svst1q_scatter_u64base_u64u10__SVBool_tu12__SVUint64_tS0_(
+// CPP-CHECK-NEXT: entry:
+// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 2 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv2i1(<vscale x 16 x i1> [[PG:%.*]])
+// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sve.st1q.scatter.scalar.offset.nxv2i64.nxv2i64(<vscale x 2 x i64> [[DATA:%.*]], <vscale x 2 x i1> [[TMP0]], <vscale x 2 x i64> [[BASE:%.*]], i64 0)
+// CPP-CHECK-NEXT: ret void
+//
+void test_svst1q_scatter_u64base_u64(svbool_t pg, svuint64_t base, svuint64_t data)
+{
+ SVE_ACLE_FUNC(svst1q_scatter, _u64base,, _u64)(pg, base, data);
+}
+
+// CHECK-LABEL: @test_svst1q_scatter_u64base_s64(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 2 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv2i1(<vscale x 16 x i1> [[PG:%.*]])
+// CHECK-NEXT: tail call void @llvm.aarch64.sve.st1q.scatter.scalar.offset.nxv2i64.nxv2i64(<vscale x 2 x i64> [[DATA:%.*]], <vscale x 2 x i1> [[TMP0]], <vscale x 2 x i64> [[BASE:%.*]], i64 0)
+// CHECK-NEXT: ret void
+//
+// CPP-CHECK-LABEL: @_Z31test_svst1q_scatter_u64base_s64u10__SVBool_tu12__SVUint64_tu11__SVInt64_t(
+// CPP-CHECK-NEXT: entry:
+// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 2 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv2i1(<vscale x 16 x i1> [[PG:%.*]])
+// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sve.st1q.scatter.scalar.offset.nxv2i64.nxv2i64(<vscale x 2 x i64> [[DATA:%.*]], <vscale x 2 x i1> [[TMP0]], <vscale x 2 x i64> [[BASE:%.*]], i64 0)
+// CPP-CHECK-NEXT: ret void
+//
+void test_svst1q_scatter_u64base_s64(svbool_t pg, svuint64_t base, svint64_t data)
+{
+ SVE_ACLE_FUNC(svst1q_scatter, _u64base,, _s64)(pg, base, data);
+}
+
+// CHECK-LABEL: @test_svst1q_scatter_u64base_u32(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 4 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv4i1(<vscale x 16 x i1> [[PG:%.*]])
+// CHECK-NEXT: tail call void @llvm.aarch64.sve.st1q.scatter.scalar.offset.nxv4i32.nxv2i64(<vscale x 4 x i32> [[DATA:%.*]], <vscale x 4 x i1> [[TMP0]], <vscale x 2 x i64> [[BASE:%.*]], i64 0)
+// CHECK-NEXT: ret void
+//
+// CPP-CHECK-LABEL: @_Z31test_svst1q_scatter_u64base_u32u10__SVBool_tu12__SVUint64_tu12__SVUint32_t(
+// CPP-CHECK-NEXT: entry:
+// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 4 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv4i1(<vscale x 16 x i1> [[PG:%.*]])
+// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sve.st1q.scatter.scalar.offset.nxv4i32.nxv2i64(<vscale x 4 x i32> [[DATA:%.*]], <vscale x 4 x i1> [[TMP0]], <vscale x 2 x i64> [[BASE:%.*]], i64 0)
+// CPP-CHECK-NEXT: ret void
+//
+void test_svst1q_scatter_u64base_u32(svbool_t pg, svuint64_t base, svuint32_t data)
+{
+ SVE_ACLE_FUNC(svst1q_scatter, _u64base,, _u32)(pg, base, data);
+}
+
+// CHECK-LABEL: @test_svst1q_scatter_u64base_s32(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 4 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv4i1(<vscale x 16 x i1> [[PG:%.*]])
+// CHECK-NEXT: tail call void @llvm.aarch64.sve.st1q.scatter.scalar.offset.nxv4i32.nxv2i64(<vscale x 4 x i32> [[DATA:%.*]], <vscale x 4 x i1> [[TMP0]], <vscale x 2 x i64> [[BASE:%.*]], i64 0)
+// CHECK-NEXT: ret void
+//
+// CPP-CHECK-LABEL: @_Z31test_svst1q_scatter_u64base_s32u10__SVBool_tu12__SVUint64_tu11__SVInt32_t(
+// CPP-CHECK-NEXT: entry:
+// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 4 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv4i1(<vscale x 16 x i1> [[PG:%.*]])
+// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sve.st1q.scatter.scalar.offset.nxv4i32.nxv2i64(<vscale x 4 x i32> [[DATA:%.*]], <vscale x 4 x i1> [[TMP0]], <vscale x 2 x i64> [[BASE:%.*]], i64 0)
+// CPP-CHECK-NEXT: ret void
+//
+void test_svst1q_scatter_u64base_s32(svbool_t pg, svuint64_t base, svint32_t data)
+{
+ SVE_ACLE_FUNC(svst1q_scatter, _u64base,, _s32)(pg, base, data);
+}
+
+// CHECK-LABEL: @test_svst1q_scatter_u64base_u16(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 8 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv8i1(<vscale x 16 x i1> [[PG:%.*]])
+// CHECK-NEXT: tail call void @llvm.aarch64.sve.st1q.scatter.scalar.offset.nxv8i16.nxv2i64(<vscale x 8 x i16> [[DATA:%.*]], <vscale x 8 x i1> [[TMP0]], <vscale x 2 x i64> [[BASE:%.*]], i64 0)
+// CHECK-NEXT: ret void
+//
+// CPP-CHECK-LABEL: @_Z31test_svst1q_scatter_u64base_u16u10__SVBool_tu12__SVUint64_tu12__SVUint16_t(
+// CPP-CHECK-NEXT: entry:
+// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 8 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv8i1(<vscale x 16 x i1> [[PG:%.*]])
+// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sve.st1q.scatter.scalar.offset.nxv8i16.nxv2i64(<vscale x 8 x i16> [[DATA:%.*]], <vscale x 8 x i1> [[TMP0]], <vscale x 2 x i64> [[BASE:%.*]], i64 0)
+// CPP-CHECK-NEXT: ret void
+//
+void test_svst1q_scatter_u64base_u16(svbool_t pg, svuint64_t base, svuint16_t data)
+{
+ SVE_ACLE_FUNC(svst1q_scatter, _u64base,, _u16)(pg, base, data);
+}
+
+// CHECK-LABEL: @test_svst1q_scatter_u64base_s16(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 8 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv8i1(<vscale x 16 x i1> [[PG:%.*]])
+// CHECK-NEXT: tail call void @llvm.aarch64.sve.st1q.scatter.scalar.offset.nxv8i16.nxv2i64(<vscale x 8 x i16> [[DATA:%.*]], <vscale x 8 x i1> [[TMP0]], <vscale x 2 x i64> [[BASE:%.*]], i64 0)
+// CHECK-NEXT: ret void
+//
+// CPP-CHECK-LABEL: @_Z31test_svst1q_scatter_u64base_s16u10__SVBool_tu12__SVUint64_tu11__SVInt16_t(
+// CPP-CHECK-NEXT: entry:
+// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 8 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv8i1(<vscale x 16 x i1> [[PG:%.*]])
+// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sve.st1q.scatter.scalar.offset.nxv8i16.nxv2i64(<vscale x 8 x i16> [[DATA:%.*]], <vscale x 8 x i1> [[TMP0]], <vscale x 2 x i64> [[BASE:%.*]], i64 0)
+// CPP-CHECK-NEXT: ret void
+//
+void test_svst1q_scatter_u64base_s16(svbool_t pg, svuint64_t base, svint16_t data)
+{
+ SVE_ACLE_FUNC(svst1q_scatter, _u64base,, _s16)(pg, base, data);
+}
+
+// CHECK-LABEL: @test_svst1q_scatter_u64base_u8(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: tail call void @llvm.aarch64.sve.st1q.scatter.scalar.offset.nxv16i8.nxv2i64(<vscale x 16 x i8> [[DATA:%.*]], <vscale x 16 x i1> [[PG:%.*]], <vscale x 2 x i64> [[BASE:%.*]], i64 0)
+// CHECK-NEXT: ret void
+//
+// CPP-CHECK-LABEL: @_Z30test_svst1q_scatter_u64base_u8u10__SVBool_tu12__SVUint64_tu11__SVUint8_t(
+// CPP-CHECK-NEXT: entry:
+// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sve.st1q.scatter.scalar.offset.nxv16i8.nxv2i64(<vscale x 16 x i8> [[DATA:%.*]], <vscale x 16 x i1> [[PG:%.*]], <vscale x 2 x i64> [[BASE:%.*]], i64 0)
+// CPP-CHECK-NEXT: ret void
+//
+void test_svst1q_scatter_u64base_u8(svbool_t pg, svuint64_t base, svuint8_t data)
+{
+ SVE_ACLE_FUNC(svst1q_scatter, _u64base,, _u8)(pg, base, data);
+}
+
+// CHECK-LABEL: @test_svst1q_scatter_u64base_s8(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: tail call void @llvm.aarch64.sve.st1q.scatter.scalar.offset.nxv16i8.nxv2i64(<vscale x 16 x i8> [[DATA:%.*]], <vscale x 16 x i1> [[PG:%.*]], <vscale x 2 x i64> [[BASE:%.*]], i64 0)
+// CHECK-NEXT: ret void
+//
+// CPP-CHECK-LABEL: @_Z30test_svst1q_scatter_u64base_s8u10__SVBool_tu12__SVUint64_tu10__SVInt8_t(
+// CPP-CHECK-NEXT: entry:
+// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sve.st1q.scatter.scalar.offset.nxv16i8.nxv2i64(<vscale x 16 x i8> [[DATA:%.*]], <vscale x 16 x i1> [[PG:%.*]], <vscale x 2 x i64> [[BASE:%.*]], i64 0)
+// CPP-CHECK-NEXT: ret void
+//
+void test_svst1q_scatter_u64base_s8(svbool_t pg, svuint64_t base, svint8_t data)
+{
+ SVE_ACLE_FUNC(svst1q_scatter, _u64base,, _s8)(pg, base, data);
+}
+
+// CHECK-LABEL: @test_svst1q_scatter_u64base_f64(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 2 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv2i1(<vscale x 16 x i1> [[PG:%.*]])
+// CHECK-NEXT: tail call void @llvm.aarch64.sve.st1q.scatter.scalar.offset.nxv2f64.nxv2i64(<vscale x 2 x double> [[DATA:%.*]], <vscale x 2 x i1> [[TMP0]], <vscale x 2 x i64> [[BASE:%.*]], i64 0)
+// CHECK-NEXT: ret void
+//
+// CPP-CHECK-LABEL: @_Z31test_svst1q_scatter_u64base_f64u10__SVBool_tu12__SVUint64_tu13__SVFloat64_t(
+// CPP-CHECK-NEXT: entry:
+// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 2 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv2i1(<vscale x 16 x i1> [[PG:%.*]])
+// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sve.st1q.scatter.scalar.offset.nxv2f64.nxv2i64(<vscale x 2 x double> [[DATA:%.*]], <vscale x 2 x i1> [[TMP0]], <vscale x 2 x i64> [[BASE:%.*]], i64 0)
+// CPP-CHECK-NEXT: ret void
+//
+void test_svst1q_scatter_u64base_f64(svbool_t pg, svuint64_t base, svfloat64_t data)
+{
+ SVE_ACLE_FUNC(svst1q_scatter, _u64base,, _f64)(pg, base, data);
+}
+
+// CHECK-LABEL: @test_svst1q_scatter_u64base_f32(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 4 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv4i1(<vscale x 16 x i1> [[PG:%.*]])
+// CHECK-NEXT: tail call void @llvm.aarch64.sve.st1q.scatter.scalar.offset.nxv4f32.nxv2i64(<vscale x 4 x float> [[DATA:%.*]], <vscale x 4 x i1> [[TMP0]], <vscale x 2 x i64> [[BASE:%.*]], i64 0)
+// CHECK-NEXT: ret void
+//
+// CPP-CHECK-LABEL: @_Z31test_svst1q_scatter_u64base_f32u10__SVBool_tu12__SVUint64_tu13__SVFloat32_t(
+// CPP-CHECK-NEXT: entry:
+// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 4 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv4i1(<vscale x 16 x i1> [[PG:%.*]])
+// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sve.st1q.scatter.scalar.offset.nxv4f32.nxv2i64(<vscale x 4 x float> [[DATA:%.*]], <vscale x 4 x i1> [[TMP0]], <vscale x 2 x i64> [[BASE:%.*]], i64 0)
+// CPP-CHECK-NEXT: ret void
+//
+void test_svst1q_scatter_u64base_f32(svbool_t pg, svuint64_t base, svfloat32_t data)
+{
+ SVE_ACLE_FUNC(svst1q_scatter, _u64base,, _f32)(pg, base, data);
+}
+
+// CHECK-LABEL: @test_svst1q_scatter_u64base_f16(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 8 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv8i1(<vscale x 16 x i1> [[PG:%.*]])
+// CHECK-NEXT: tail call void @llvm.aarch64.sve.st1q.scatter.scalar.offset.nxv8f16.nxv2i64(<vscale x 8 x half> [[DATA:%.*]], <vscale x 8 x i1> [[TMP0]], <vscale x 2 x i64> [[BASE:%.*]], i64 0)
+// CHECK-NEXT: ret void
+//
+// CPP-CHECK-LABEL: @_Z31test_svst1q_scatter_u64base_f16u10__SVBool_tu12__SVUint64_tu13__SVFloat16_t(
+// CPP-CHECK-NEXT: entry:
+// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 8 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv8i1(<vscale x 16 x i1> [[PG:%.*]])
+// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sve.st1q.scatter.scalar.offset.nxv8f16.nxv2i64(<vscale x 8 x half> [[DATA:%.*]], <vscale x 8 x i1> [[TMP0]], <vscale x 2 x i64> [[BASE:%.*]], i64 0)
+// CPP-CHECK-NEXT: ret void
+//
+void test_svst1q_scatter_u64base_f16(svbool_t pg, svuint64_t base, svfloat16_t data)
+{
+ SVE_ACLE_FUNC(svst1q_scatter, _u64base,, _f16)(pg, base, data);
+}
+
+// CHECK-LABEL: @test_svst1q_scatter_u64base_bf16(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 8 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv8i1(<vscale x 16 x i1> [[PG:%.*]])
+// CHECK-NEXT: tail call void @llvm.aarch64.sve.st1q.scatter.scalar.offset.nxv8bf16.nxv2i64(<vscale x 8 x bfloat> [[DATA:%.*]], <vscale x 8 x i1> [[TMP0]], <vscale x 2 x i64> [[BASE:%.*]], i64 0)
+// CHECK-NEXT: ret void
+//
+// CPP-CHECK-LABEL: @_Z32test_svst1q_scatter_u64base_bf16u10__SVBool_tu12__SVUint64_tu14__SVBfloat16_t(
+// CPP-CHECK-NEXT: entry:
+// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 8 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv8i1(<vscale x 16 x i1> [[PG:%.*]])
+// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sve.st1q.scatter.scalar.offset.nxv8bf16.nxv2i64(<vscale x 8 x bfloat> [[DATA:%.*]], <vscale x 8 x i1> [[TMP0]], <vscale x 2 x i64> [[BASE:%.*]], i64 0)
+// CPP-CHECK-NEXT: ret void
+//
+void test_svst1q_scatter_u64base_bf16(svbool_t pg, svuint64_t base, svbfloat16_t data)
+{
+ SVE_ACLE_FUNC(svst1q_scatter, _u64base,,_bf16)(pg, base, data);
+}
diff --git a/llvm/include/llvm/IR/IntrinsicsAArch64.td b/llvm/include/llvm/IR/IntrinsicsAArch64.td
index a42e2c49cb477ba..ee2bcb0a9540d6b 100644
--- a/llvm/include/llvm/IR/IntrinsicsAArch64.td
+++ b/llvm/include/llvm/IR/IntrinsicsAArch64.td
@@ -2132,6 +2132,8 @@ def int_aarch64_sve_ld1_gather_uxtw_index : AdvSIMD_GatherLoad_SV_32b_Offsets_In
def int_aarch64_sve_ld1_gather_scalar_offset : AdvSIMD_GatherLoad_VS_Intrinsic;
+// 128-bit loads, unscaled offsets
+def int_aarch64_sve_ld1q_gather_scalar_offset : AdvSIMD_GatherLoad_VS_Intrinsic;
//
// First-faulting gather loads: scalar base + vector offsets
@@ -2208,6 +2210,9 @@ def int_aarch64_sve_st1_scatter_uxtw_index
def int_aarch64_sve_st1_scatter_scalar_offset : AdvSIMD_ScatterStore_VS_Intrinsic;
+// 128-bit stores, unscaled offsets
+def int_aarch64_sve_st1q_scatter_scalar_offset : AdvSIMD_ScatterStore_VS_Intrinsic;
+
//
// Non-temporal scatter stores: scalar base + vector offsets
//
@@ -2614,6 +2619,37 @@ def int_aarch64_sve_ld1_pn_x4 : SVE2p1_Load_PN_X4_Intrinsic;
def int_aarch64_sve_ldnt1_pn_x2 : SVE2p1_Load_PN_X2_Intrinsic;
def int_aarch64_sve_ldnt1_pn_x4 : SVE2p1_Load_PN_X4_Intrinsic;
+//
+// SVE2.1 - Contiguous loads to quadword (single vector)
+//
+
+class SVE2p1_Single_Load_Quadword
+ : DefaultAttrsIntrinsic<[llvm_anyvector_ty],
+ [llvm_nxv1i1_ty, llvm_ptr_ty],
+ [IntrReadMem]>;
+def int_aarch64_sve_ld1uwq : SVE2p1_Single_Load_Quadword;
+def int_aarch64_sve_ld1udq : SVE2p1_Single_Load_Quadword;
+
+//
+// SVE2.1 - Contiguous store from quadword (single vector)
+//
+
+class SVE2p1_Single_Store_Quadword
+ : DefaultAttrsIntrinsic<[],
+ [llvm_anyvector_ty, llvm_nxv1i1_ty, llvm_ptr_ty],
+ [IntrArgMemOnly]>;
+def int_aarch64_sve_st1uwq : SVE2p1_Single_Store_Quadword;
+def int_aarch64_sve_st1udq : SVE2p1_Single_Store_Quadword;
+
+
+def int_aarch64_sve_ld2q_sret : AdvSIMD_2Vec_PredLoad_Intrinsic;
+def int_aarch64_sve_ld3q_sret : AdvSIMD_3Vec_PredLoad_Intrinsic;
+def int_aarch64_sve_ld4q_sret : AdvSIMD_4Vec_PredLoad_Intrinsic;
+
+def int_aarch64_sve_st2q : AdvSIMD_2Vec_PredStore_Intrinsic;
+def int_aarch64_sve_st3q : AdvSIMD_3Vec_PredStore_Intrinsic;
+def int_aarch64_sve_st4q : AdvSIMD_4Vec_PredStore_Intrinsic;
+
//
// SVE2.1 - Contiguous stores to multiple consecutive vectors
//
diff --git a/llvm/lib/Target/AArch64/AArch64ISelDAGToDAG.cpp b/llvm/lib/Target/AArch64/AArch64ISelDAGToDAG.cpp
index 38759a2474518fc..b57c604bc078fb0 100644
--- a/llvm/lib/Target/AArch64/AArch64ISelDAGToDAG.cpp
+++ b/llvm/lib/Target/AArch64/AArch64ISelDAGToDAG.cpp
@@ -1783,7 +1783,7 @@ void AArch64DAGToDAGISel::SelectDestructiveMultiIntrinsic(SDNode *N,
void AArch64DAGToDAGISel::SelectPredicatedLoad(SDNode *N, unsigned NumVecs,
unsigned Scale, unsigned Opc_ri,
unsigned Opc_rr, bool IsIntr) {
- assert(Scale < 4 && "Invalid scaling value.");
+ assert(Scale < 5 && "Invalid scaling value.");
SDLoc DL(N);
EVT VT = N->getValueType(0);
SDValue Chain = N->getOperand(0);
@@ -4688,6 +4688,18 @@ void AArch64DAGToDAGISel::Select(SDNode *Node) {
case Intrinsic::aarch64_ld64b:
SelectLoad(Node, 8, AArch64::LD64B, AArch64::x8sub_0);
return;
+ case Intrinsic::aarch64_sve_ld2q_sret: {
+ SelectPredicatedLoad(Node, 2, 4, AArch64::LD2Q_IMM, AArch64::LD2Q, true);
+ return;
+ }
+ case Intrinsic::aarch64_sve_ld3q_sret: {
+ SelectPredicatedLoad(Node, 3, 4, AArch64::LD3Q_IMM, AArch64::LD3Q, true);
+ return;
+ }
+ case Intrinsic::aarch64_sve_ld4q_sret: {
+ SelectPredicatedLoad(Node, 4, 4, AArch64::LD4Q_IMM, AArch64::LD4Q, true);
+ return;
+ }
case Intrinsic::aarch64_sve_ld2_sret: {
if (VT == MVT::nxv16i8) {
SelectPredicatedLoad(Node, 2, 0, AArch64::LD2B_IMM, AArch64::LD2B,
@@ -5900,6 +5912,18 @@ void AArch64DAGToDAGISel::Select(SDNode *Node) {
}
break;
}
+ case Intrinsic::aarch64_sve_st2q: {
+ SelectPredicatedStore(Node, 2, 4, AArch64::ST2Q, AArch64::ST2Q_IMM);
+ return;
+ }
+ case Intrinsic::aarch64_sve_st3q: {
+ SelectPredicatedStore(Node, 3, 4, AArch64::ST3Q, AArch64::ST3Q_IMM);
+ return;
+ }
+ case Intrinsic::aarch64_sve_st4q: {
+ SelectPredicatedStore(Node, 4, 4, AArch64::ST4Q, AArch64::ST4Q_IMM);
+ return;
+ }
case Intrinsic::aarch64_sve_st2: {
if (VT == MVT::nxv16i8) {
SelectPredicatedStore(Node, 2, 0, AArch64::ST2B, AArch64::ST2B_IMM);
@@ -6681,14 +6705,32 @@ static EVT getMemVTFromNode(LLVMContext &Ctx, SDNode *Root) {
return getPackedVectorTypeFromPredicateType(
Ctx, Root->getOperand(2)->getValueType(0), /*NumVec=*/1);
case Intrinsic::aarch64_sve_ld2_sret:
+ case Intrinsic::aarch64_sve_ld2q_sret:
return getPackedVectorTypeFromPredicateType(
Ctx, Root->getOperand(2)->getValueType(0), /*NumVec=*/2);
+ case Intrinsic::aarch64_sve_st2q:
+ return getPackedVectorTypeFromPredicateType(
+ Ctx, Root->getOperand(4)->getValueType(0), /*NumVec=*/2);
case Intrinsic::aarch64_sve_ld3_sret:
+ case Intrinsic::aarch64_sve_ld3q_sret:
return getPackedVectorTypeFromPredicateType(
Ctx, Root->getOperand(2)->getValueType(0), /*NumVec=*/3);
+ case Intrinsic::aarch64_sve_st3q:
+ return getPackedVectorTypeFromPredicateType(
+ Ctx, Root->getOperand(5)->getValueType(0), /*NumVec=*/3);
case Intrinsic::aarch64_sve_ld4_sret:
+ case Intrinsic::aarch64_sve_ld4q_sret:
return getPackedVectorTypeFromPredicateType(
Ctx, Root->getOperand(2)->getValueType(0), /*NumVec=*/4);
+ case Intrinsic::aarch64_sve_st4q:
+ return getPackedVectorTypeFromPredicateType(
+ Ctx, Root->getOperand(6)->getValueType(0), /*NumVec=*/4);
+ case Intrinsic::aarch64_sve_ld1udq:
+ case Intrinsic::aarch64_sve_st1udq:
+ return EVT(MVT::nxv1i64);
+ case Intrinsic::aarch64_sve_ld1uwq:
+ case Intrinsic::aarch64_sve_st1uwq:
+ return EVT(MVT::nxv1i32);
}
}
diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
index 4daaf21d42b3ce4..c96eb86ab4dcc83 100644
--- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
+++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
@@ -2572,6 +2572,7 @@ const char *AArch64TargetLowering::getTargetNodeName(unsigned Opcode) const {
MAKE_CASE(AArch64ISD::GLD1_SXTW_SCALED_MERGE_ZERO)
MAKE_CASE(AArch64ISD::GLD1_UXTW_SCALED_MERGE_ZERO)
MAKE_CASE(AArch64ISD::GLD1_IMM_MERGE_ZERO)
+ MAKE_CASE(AArch64ISD::GLD1Q_MERGE_ZERO)
MAKE_CASE(AArch64ISD::GLD1S_MERGE_ZERO)
MAKE_CASE(AArch64ISD::GLD1S_SCALED_MERGE_ZERO)
MAKE_CASE(AArch64ISD::GLD1S_SXTW_MERGE_ZERO)
@@ -2596,6 +2597,7 @@ const char *AArch64TargetLowering::getTargetNodeName(unsigned Opcode) const {
MAKE_CASE(AArch64ISD::GLDNT1_MERGE_ZERO)
MAKE_CASE(AArch64ISD::GLDNT1_INDEX_MERGE_ZERO)
MAKE_CASE(AArch64ISD::GLDNT1S_MERGE_ZERO)
+ MAKE_CASE(AArch64ISD::SST1Q_PRED)
MAKE_CASE(AArch64ISD::ST1_PRED)
MAKE_CASE(AArch64ISD::SST1_PRED)
MAKE_CASE(AArch64ISD::SST1_SCALED_PRED)
@@ -22609,8 +22611,11 @@ static SDValue performScatterStoreCombine(SDNode *N, SelectionDAG &DAG,
return SDValue();
// For FPs, ACLE only supports _packed_ single and double precision types.
+ // SST1Q_PRED is the ST1Q for sve2p1 and should allow all sizes
if (SrcElVT.isFloatingPoint())
- if ((SrcVT != MVT::nxv4f32) && (SrcVT != MVT::nxv2f64))
+ if ((SrcVT != MVT::nxv4f32) && (SrcVT != MVT::nxv2f64) &&
+ (Opcode != AArch64ISD::SST1Q_PRED ||
+ ((SrcVT != MVT::nxv8f16) && (SrcVT != MVT::nxv8bf16))))
return SDValue();
// Depending on the addressing mode, this is either a pointer or a vector of
@@ -23579,6 +23584,8 @@ SDValue AArch64TargetLowering::PerformDAGCombine(SDNode *N,
return performScatterStoreCombine(N, DAG, AArch64ISD::SSTNT1_INDEX_PRED);
case Intrinsic::aarch64_sve_ld1_gather:
return performGatherLoadCombine(N, DAG, AArch64ISD::GLD1_MERGE_ZERO);
+ case Intrinsic::aarch64_sve_ld1q_gather_scalar_offset:
+ return performGatherLoadCombine(N, DAG, AArch64ISD::GLD1Q_MERGE_ZERO);
case Intrinsic::aarch64_sve_ld1_gather_index:
return performGatherLoadCombine(N, DAG,
AArch64ISD::GLD1_SCALED_MERGE_ZERO);
@@ -23622,6 +23629,8 @@ SDValue AArch64TargetLowering::PerformDAGCombine(SDNode *N,
case Intrinsic::aarch64_sve_ldff1_gather_scalar_offset:
return performGatherLoadCombine(N, DAG,
AArch64ISD::GLDFF1_IMM_MERGE_ZERO);
+ case Intrinsic::aarch64_sve_st1q_scatter_scalar_offset:
+ return performScatterStoreCombine(N, DAG, AArch64ISD::SST1Q_PRED);
case Intrinsic::aarch64_sve_st1_scatter:
return performScatterStoreCombine(N, DAG, AArch64ISD::SST1_PRED);
case Intrinsic::aarch64_sve_st1_scatter_index:
diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.h b/llvm/lib/Target/AArch64/AArch64ISelLowering.h
index 52e519cd8a0c93c..f05aaab86a23135 100644
--- a/llvm/lib/Target/AArch64/AArch64ISelLowering.h
+++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.h
@@ -373,6 +373,7 @@ enum NodeType : unsigned {
GLD1_UXTW_SCALED_MERGE_ZERO,
GLD1_SXTW_SCALED_MERGE_ZERO,
GLD1_IMM_MERGE_ZERO,
+ GLD1Q_MERGE_ZERO,
// Signed gather loads
GLD1S_MERGE_ZERO,
@@ -417,6 +418,7 @@ enum NodeType : unsigned {
SST1_UXTW_SCALED_PRED,
SST1_SXTW_SCALED_PRED,
SST1_IMM_PRED,
+ SST1Q_PRED,
// Non-temporal scatter store
SSTNT1_PRED,
diff --git a/llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td b/llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td
index 1a586765d58b3ca..7c10c8393d3024b 100644
--- a/llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td
+++ b/llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td
@@ -100,6 +100,8 @@ def AArch64ldff1s_gather_imm_z : SDNode<"AArch64ISD::GLDFF1S_IMM_MERGE_Z
def AArch64ldnt1_gather_z : SDNode<"AArch64ISD::GLDNT1_MERGE_ZERO", SDT_AArch64_GATHER_VS, [SDNPHasChain, SDNPMayLoad]>;
def AArch64ldnt1s_gather_z : SDNode<"AArch64ISD::GLDNT1S_MERGE_ZERO", SDT_AArch64_GATHER_VS, [SDNPHasChain, SDNPMayLoad]>;
+// Gather vector base + scalar offset
+def AArch64ld1q_gather_z: SDNode<"AArch64ISD::GLD1Q_MERGE_ZERO", SDT_AArch64_GATHER_VS, [SDNPHasChain, SDNPMayLoad]>;
// Contiguous stores - node definitions
//
@@ -132,6 +134,9 @@ def AArch64st1_scatter_imm : SDNode<"AArch64ISD::SST1_IMM_PRED",
def AArch64stnt1_scatter : SDNode<"AArch64ISD::SSTNT1_PRED", SDT_AArch64_SCATTER_VS, [SDNPHasChain, SDNPMayStore]>;
+// Scatter vector base + scalar offset
+def AArch64st1q_scatter : SDNode<"AArch64ISD::SST1Q_PRED", SDT_AArch64_SCATTER_VS, [SDNPHasChain, SDNPMayStore]>;
+
// AArch64 SVE/SVE2 - the remaining node definitions
//
@@ -1157,7 +1162,7 @@ let Predicates = [HasSVE] in {
defm GLD1D : sve_mem_64b_gld_vs2_64_unscaled<0b1110, "ld1d", AArch64ld1_gather_z, nxv2i64>;
defm GLDFF1D : sve_mem_64b_gld_vs2_64_unscaled<0b1111, "ldff1d", AArch64ldff1_gather_z, nxv2i64>;
let Predicates = [HasSVE2p1] in {
- defm GLD1Q : sve_mem_128b_gld_64_unscaled<"ld1q">;
+ defm GLD1Q : sve_mem_128b_gld_64_unscaled<"ld1q", AArch64ld1q_gather_z>;
}
// Gathers using scaled 64-bit offsets, e.g.
@@ -1338,6 +1343,49 @@ let Predicates = [HasSVEorSME] in {
let Predicates = [HasSVE2p1] in {
defm ST1D_Q : sve_mem_cst_ss<0b1110, "st1d", Z_q, ZPR128, GPR64NoXZRshifted64>;
}
+
+ multiclass sve_ld1q_pat<ValueType Ty, ValueType PredTy, SDPatternOperator Load1qOp, Instruction RegRegInst, Instruction RegImmInst, ComplexPattern AddrCP> {
+ let AddedComplexity = 2 in {
+ def _reg_imm : Pat<(Ty (Load1qOp (PredTy PPR3bAny:$Pg), (am_sve_indexed_s4 GPR64sp:$base, simm4s1:$imm))),
+ (RegImmInst PPR3bAny:$Pg, GPR64sp:$base, simm4s1:$imm)>;
+ }
+
+ let AddedComplexity = 1 in {
+ def _reg_reg : Pat<(Ty (Load1qOp (PredTy PPR3bAny:$Pg), (AddrCP GPR64sp:$base, GPR64:$offset))),
+ (RegRegInst PPR3bAny:$Pg, GPR64sp:$base, GPR64:$offset)>;
+ }
+
+ def _default : Pat<(Ty (Load1qOp (PredTy PPR3bAny:$Pg), (i64 GPR64sp:$base))),
+ (RegImmInst PPR3bAny:$Pg, GPR64sp:$base, (i64 0))>;
+ }
+
+ multiclass sve_st1q_pat<ValueType DataType, ValueType PredTy, SDPatternOperator Store1qOp, Instruction RegRegInst, Instruction RegImmInst, ComplexPattern AddrCP> {
+ let AddedComplexity = 2 in {
+ def _reg_imm : Pat<(Store1qOp (DataType ZPR128:$Zt), (PredTy PPR3bAny:$Pg), (am_sve_indexed_s4 GPR64sp:$base, simm4s1:$imm)),
+ (RegImmInst Z_q:$Zt, PPR3bAny:$Pg, GPR64sp:$base, simm4s1:$imm)>;
+ }
+
+ let AddedComplexity = 1 in {
+ def _reg_reg : Pat<(Store1qOp (DataType ZPR128:$Zt), (PredTy PPR3bAny:$Pg), (AddrCP GPR64sp:$base, GPR64:$offset)),
+ (RegRegInst Z_q:$Zt, PPR3bAny:$Pg, GPR64sp:$base, GPR64:$offset)>;
+ }
+
+ def _default : Pat<(Store1qOp (DataType ZPR128:$Zt), (PredTy PPR3bAny:$Pg), (i64 GPR64sp:$base)),
+ (RegImmInst Z_q:$Zt, PPR3bAny:$Pg, GPR64sp:$base, (i64 0))>;
+ }
+
+ // ld1quw/st1quw
+ defm : sve_ld1q_pat<nxv4i32, nxv1i1, int_aarch64_sve_ld1uwq, LD1W_Q, LD1W_Q_IMM, am_sve_regreg_lsl2>;
+ defm : sve_ld1q_pat<nxv4f32, nxv1i1, int_aarch64_sve_ld1uwq, LD1W_Q, LD1W_Q_IMM, am_sve_regreg_lsl2>;
+ defm : sve_st1q_pat<nxv4i32, nxv1i1, int_aarch64_sve_st1uwq, ST1W_Q, ST1W_Q_IMM, am_sve_regreg_lsl2>;
+ defm : sve_st1q_pat<nxv4f32, nxv1i1, int_aarch64_sve_st1uwq, ST1W_Q, ST1W_Q_IMM, am_sve_regreg_lsl2>;
+
+ // ld1qud/st1qud
+ defm : sve_ld1q_pat<nxv2i64, nxv1i1, int_aarch64_sve_ld1udq, LD1D_Q, LD1D_Q_IMM, am_sve_regreg_lsl3>;
+ defm : sve_ld1q_pat<nxv2f64, nxv1i1, int_aarch64_sve_ld1udq, LD1D_Q, LD1D_Q_IMM, am_sve_regreg_lsl3>;
+ defm : sve_st1q_pat<nxv2i64, nxv1i1, int_aarch64_sve_st1udq, ST1D_Q, ST1D_Q_IMM, am_sve_regreg_lsl3>;
+ defm : sve_st1q_pat<nxv2f64, nxv1i1, int_aarch64_sve_st1udq, ST1D_Q, ST1D_Q_IMM, am_sve_regreg_lsl3>;
+
} // End HasSVEorSME
let Predicates = [HasSVE] in {
@@ -1385,7 +1433,7 @@ let Predicates = [HasSVE] in {
defm SST1W_D : sve_mem_sst_sv_64_unscaled<0b10, "st1w", AArch64st1_scatter, nxv2i32>;
defm SST1D : sve_mem_sst_sv_64_unscaled<0b11, "st1d", AArch64st1_scatter, nxv2i64>;
let Predicates = [HasSVE2p1] in {
- defm SST1Q : sve_mem_sst_128b_64_unscaled<"st1q">;
+ defm SST1Q : sve_mem_sst_128b_64_unscaled<"st1q", AArch64st1q_scatter>;
}
// Scatters using scaled 64-bit offsets, e.g.
diff --git a/llvm/lib/Target/AArch64/SVEInstrFormats.td b/llvm/lib/Target/AArch64/SVEInstrFormats.td
index d2f72fda3a229b8..36f40fecc8c0d62 100644
--- a/llvm/lib/Target/AArch64/SVEInstrFormats.td
+++ b/llvm/lib/Target/AArch64/SVEInstrFormats.td
@@ -9746,11 +9746,30 @@ class sve_mem_128b_gld_64_unscaled<string mnemonic>
}
-multiclass sve_mem_128b_gld_64_unscaled<string mnemonic> {
+multiclass sve_mem_128b_gld_64_unscaled<string mnemonic, SDPatternOperator op> {
def NAME : sve_mem_128b_gld_64_unscaled<mnemonic>;
def : InstAlias<mnemonic # " $Zt, $Pg/z, [$Zn]",
(!cast<Instruction>(NAME) Z_q:$Zt, PPR3bAny:$Pg, ZPR64:$Zn, XZR), 1>;
+
+
+ def : Pat<(nxv2i64 (op (nxv2i1 PPR3bAny:$Pg), (nxv2i64 ZPR64:$Zn), (i64 GPR64sp:$Rm), nxv2i64)),
+ (!cast<Instruction>(NAME) PPR3bAny:$Pg, ZPR64:$Zn, GPR64:$Rm)>;
+ def : Pat<(nxv4i32 (op (nxv4i1 PPR3bAny:$Pg), (nxv2i64 ZPR64:$Zn), (i64 GPR64sp:$Rm), nxv4i32)),
+ (!cast<Instruction>(NAME) PPR3bAny:$Pg, ZPR64:$Zn, GPR64:$Rm)>;
+ def : Pat<(nxv8i16 (op (nxv8i1 PPR3bAny:$Pg), (nxv2i64 ZPR64:$Zn), (i64 GPR64sp:$Rm), nxv8i16)),
+ (!cast<Instruction>(NAME) PPR3bAny:$Pg, ZPR64:$Zn, GPR64:$Rm)>;
+ def : Pat<(nxv16i8 (op (nxv16i1 PPR3bAny:$Pg), (nxv2i64 ZPR64:$Zn), (i64 GPR64sp:$Rm), nxv16i8)),
+ (!cast<Instruction>(NAME) PPR3bAny:$Pg, ZPR64:$Zn, GPR64:$Rm)>;
+
+ def : Pat<(nxv2f64 (op (nxv2i1 PPR3bAny:$Pg), (nxv2i64 ZPR64:$Zn), (i64 GPR64sp:$Rm), nxv2f64)),
+ (!cast<Instruction>(NAME) PPR3bAny:$Pg, ZPR64:$Zn, GPR64:$Rm)>;
+ def : Pat<(nxv4f32 (op (nxv4i1 PPR3bAny:$Pg), (nxv2i64 ZPR64:$Zn), (i64 GPR64sp:$Rm), nxv4f32)),
+ (!cast<Instruction>(NAME) PPR3bAny:$Pg, ZPR64:$Zn, GPR64:$Rm)>;
+ def : Pat<(nxv8f16 (op (nxv8i1 PPR3bAny:$Pg), (nxv2i64 ZPR64:$Zn), (i64 GPR64sp:$Rm), nxv8f16)),
+ (!cast<Instruction>(NAME) PPR3bAny:$Pg, ZPR64:$Zn, GPR64:$Rm)>;
+ def : Pat<(nxv8bf16 (op (nxv8i1 PPR3bAny:$Pg), (nxv2i64 ZPR64:$Zn), (i64 GPR64sp:$Rm), nxv8bf16)),
+ (!cast<Instruction>(NAME) PPR3bAny:$Pg, ZPR64:$Zn, GPR64:$Rm)>;
}
class sve_mem_sst_128b_64_unscaled<string mnemonic>
@@ -9773,11 +9792,29 @@ class sve_mem_sst_128b_64_unscaled<string mnemonic>
}
-multiclass sve_mem_sst_128b_64_unscaled<string mnemonic> {
+multiclass sve_mem_sst_128b_64_unscaled<string mnemonic, SDPatternOperator op> {
def NAME : sve_mem_sst_128b_64_unscaled<mnemonic>;
def : InstAlias<mnemonic # " $Zt, $Pg, [$Zn]",
(!cast<Instruction>(NAME) Z_q:$Zt, PPR3bAny:$Pg, ZPR64:$Zn, XZR), 1>;
+
+ def : Pat<(op (nxv2i64 Z_q:$Zt), (nxv2i1 PPR3bAny:$gp), (nxv2i64 ZPR64:$Zn), (i64 GPR64sp:$Rm), nxv2i64),
+ (!cast<Instruction>(NAME) Z_q:$Zt, PPR3bAny:$gp, ZPR64:$Zn, GPR64:$Rm)>;
+ def : Pat<(op (nxv4i32 Z_q:$Zt), (nxv4i1 PPR3bAny:$gp), (nxv2i64 ZPR64:$Zn), (i64 GPR64sp:$Rm), nxv4i32),
+ (!cast<Instruction>(NAME) Z_q:$Zt, PPR3bAny:$gp, ZPR64:$Zn, GPR64:$Rm)>;
+ def : Pat<(op (nxv8i16 Z_q:$Zt), (nxv8i1 PPR3bAny:$gp), (nxv2i64 ZPR64:$Zn), (i64 GPR64sp:$Rm), nxv8i16),
+ (!cast<Instruction>(NAME) Z_q:$Zt, PPR3bAny:$gp,ZPR64:$Zn, GPR64:$Rm)>;
+ def : Pat<(op (nxv16i8 Z_q:$Zt), (nxv16i1 PPR3bAny:$gp), (nxv2i64 ZPR64:$Zn), (i64 GPR64sp:$Rm), nxv16i8),
+ (!cast<Instruction>(NAME) Z_q:$Zt, PPR3bAny:$gp, ZPR64:$Zn, GPR64:$Rm)>;
+
+ def : Pat<(op (nxv2f64 Z_q:$Zt), (nxv2i1 PPR3bAny:$gp), (nxv2i64 ZPR64:$Zn), (i64 GPR64sp:$Rm), nxv2f64),
+ (!cast<Instruction>(NAME) Z_q:$Zt, PPR3bAny:$gp, ZPR64:$Zn, GPR64:$Rm)>;
+ def : Pat<(op (nxv4f32 Z_q:$Zt), (nxv4i1 PPR3bAny:$gp), (nxv2i64 ZPR64:$Zn), (i64 GPR64sp:$Rm), nxv4f32),
+ (!cast<Instruction>(NAME) Z_q:$Zt, PPR3bAny:$gp, ZPR64:$Zn, GPR64:$Rm)>;
+ def : Pat<(op (nxv8f16 Z_q:$Zt), (nxv8i1 PPR3bAny:$gp), (nxv2i64 ZPR64:$Zn), (i64 GPR64sp:$Rm), nxv8f16),
+ (!cast<Instruction>(NAME) Z_q:$Zt, PPR3bAny:$gp, ZPR64:$Zn, GPR64:$Rm)>;
+ def : Pat<(op (nxv8bf16 Z_q:$Zt), (nxv8i1 PPR3bAny:$gp), (nxv2i64 ZPR64:$Zn), (i64 GPR64sp:$Rm), nxv8bf16),
+ (!cast<Instruction>(NAME) Z_q:$Zt, PPR3bAny:$gp, ZPR64:$Zn, GPR64:$Rm)>;
}
diff --git a/llvm/test/CodeGen/AArch64/sve2p1-intrinsics-gather-loads-128bit-unscaled-offset.ll b/llvm/test/CodeGen/AArch64/sve2p1-intrinsics-gather-loads-128bit-unscaled-offset.ll
new file mode 100644
index 000000000000000..928bc07514b0ed4
--- /dev/null
+++ b/llvm/test/CodeGen/AArch64/sve2p1-intrinsics-gather-loads-128bit-unscaled-offset.ll
@@ -0,0 +1,105 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc -mtriple=aarch64-linux-gnu -mattr=+sve2p1,+bf16 < %s | FileCheck %s
+
+;
+; LD1Q: vector base + unscaled offset
+; e.g. ld1q { z0.q }, p0/z, [z0.d, x0]
+;
+define <vscale x 16 x i8> @ld1q_gather_u64base_i8(<vscale x 16 x i1> %pg, <vscale x 2 x i64> %base, i64 %offset) {
+; CHECK-LABEL: ld1q_gather_u64base_i8:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ld1q { z0.q }, p0/z, [z0.d, x0]
+; CHECK-NEXT: ret
+ %load = call <vscale x 16 x i8> @llvm.aarch64.sve.ld1q.gather.scalar.offset.nxv16i8.nxv2i64(<vscale x 16 x i1> %pg,
+ <vscale x 2 x i64> %base,
+ i64 %offset)
+ ret <vscale x 16 x i8> %load
+}
+
+define <vscale x 8 x i16> @ld1q_gather_u64base_i16(<vscale x 8 x i1> %pg, <vscale x 2 x i64> %base, i64 %offset) {
+; CHECK-LABEL: ld1q_gather_u64base_i16:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ld1q { z0.q }, p0/z, [z0.d, x0]
+; CHECK-NEXT: ret
+ %load = call <vscale x 8 x i16> @llvm.aarch64.sve.ld1q.gather.scalar.offset.nxv8i16.nxv2i64(<vscale x 8 x i1> %pg,
+ <vscale x 2 x i64> %base,
+ i64 %offset)
+ ret <vscale x 8 x i16> %load
+}
+
+define <vscale x 4 x i32> @ld1q_gather_u64base_i32(<vscale x 4 x i1> %pg, <vscale x 2 x i64> %base, i64 %offset) {
+; CHECK-LABEL: ld1q_gather_u64base_i32:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ld1q { z0.q }, p0/z, [z0.d, x0]
+; CHECK-NEXT: ret
+ %load = call <vscale x 4 x i32> @llvm.aarch64.sve.ld1q.gather.scalar.offset.nxv4i32.nxv2i64(<vscale x 4 x i1> %pg,
+ <vscale x 2 x i64> %base,
+ i64 %offset)
+ ret <vscale x 4 x i32> %load
+}
+
+define <vscale x 2 x i64> @ld1q_gather_u64base_i64(<vscale x 2 x i1> %pg, <vscale x 2 x i64> %base, i64 %offset) {
+; CHECK-LABEL: ld1q_gather_u64base_i64:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ld1q { z0.q }, p0/z, [z0.d, x0]
+; CHECK-NEXT: ret
+ %load = call <vscale x 2 x i64> @llvm.aarch64.sve.ld1q.gather.scalar.offset.nxv2i64.nxv2i64(<vscale x 2 x i1> %pg,
+ <vscale x 2 x i64> %base,
+ i64 %offset)
+ ret <vscale x 2 x i64> %load
+}
+
+define <vscale x 8 x half> @ld1q_gather_u64base_f16(<vscale x 8 x i1> %pg, <vscale x 2 x i64> %base, i64 %offset) {
+; CHECK-LABEL: ld1q_gather_u64base_f16:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ld1q { z0.q }, p0/z, [z0.d, x0]
+; CHECK-NEXT: ret
+ %load = call <vscale x 8 x half> @llvm.aarch64.sve.ld1q.gather.scalar.offset.nxv8f16.nxv2i64(<vscale x 8 x i1> %pg,
+ <vscale x 2 x i64> %base,
+ i64 %offset)
+ ret <vscale x 8 x half> %load
+}
+
+define <vscale x 4 x float> @ld1q_gather_u64base_f32(<vscale x 4 x i1> %pg, <vscale x 2 x i64> %base, i64 %offset) {
+; CHECK-LABEL: ld1q_gather_u64base_f32:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ld1q { z0.q }, p0/z, [z0.d, x0]
+; CHECK-NEXT: ret
+ %load = call <vscale x 4 x float> @llvm.aarch64.sve.ld1q.gather.scalar.offset.nxv4f32.nxv2i64(<vscale x 4 x i1> %pg,
+ <vscale x 2 x i64> %base,
+ i64 %offset)
+ ret <vscale x 4 x float> %load
+}
+
+
+define <vscale x 2 x double> @ld1q_gather_u64base_f64(<vscale x 2 x i1> %pg, <vscale x 2 x i64> %base, i64 %offset) {
+; CHECK-LABEL: ld1q_gather_u64base_f64:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ld1q { z0.q }, p0/z, [z0.d, x0]
+; CHECK-NEXT: ret
+ %load = call <vscale x 2 x double> @llvm.aarch64.sve.ld1q.gather.scalar.offset.nxv2f64.nxv2i64(<vscale x 2 x i1> %pg,
+ <vscale x 2 x i64> %base,
+ i64 %offset)
+ ret <vscale x 2 x double> %load
+}
+
+define <vscale x 8 x bfloat> @ld1q_gather_u64base_bf16(<vscale x 8 x i1> %pg, <vscale x 2 x i64> %base, i64 %offset) {
+; CHECK-LABEL: ld1q_gather_u64base_bf16:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ld1q { z0.q }, p0/z, [z0.d, x0]
+; CHECK-NEXT: ret
+ %load = call <vscale x 8 x bfloat> @llvm.aarch64.sve.ld1q.gather.scalar.offset.nxv8bf16.nxv2i64(<vscale x 8 x i1> %pg,
+ <vscale x 2 x i64> %base,
+ i64 %offset)
+ ret <vscale x 8 x bfloat> %load
+}
+
+declare <vscale x 16 x i8> @llvm.aarch64.sve.ld1q.gather.scalar.offset.nxv16i8.nxv2i64(<vscale x 16 x i1>, <vscale x 2 x i64>, i64)
+declare <vscale x 8 x i16> @llvm.aarch64.sve.ld1q.gather.scalar.offset.nxv8i16.nxv2i64(<vscale x 8 x i1>, <vscale x 2 x i64>, i64)
+declare <vscale x 4 x i32> @llvm.aarch64.sve.ld1q.gather.scalar.offset.nxv4i32.nxv2i64(<vscale x 4 x i1>, <vscale x 2 x i64>, i64)
+declare <vscale x 2 x i64> @llvm.aarch64.sve.ld1q.gather.scalar.offset.nxv2i64.nxv2i64(<vscale x 2 x i1>, <vscale x 2 x i64>, i64)
+declare <vscale x 8 x half> @llvm.aarch64.sve.ld1q.gather.scalar.offset.nxv8f16.nxv2i64(<vscale x 8 x i1>, <vscale x 2 x i64>, i64)
+declare <vscale x 4 x float> @llvm.aarch64.sve.ld1q.gather.scalar.offset.nxv4f32.nxv2i64(<vscale x 4 x i1>, <vscale x 2 x i64>, i64)
+declare <vscale x 2 x double> @llvm.aarch64.sve.ld1q.gather.scalar.offset.nxv2f64.nxv2i64(<vscale x 2 x i1>, <vscale x 2 x i64>, i64)
+declare <vscale x 8 x bfloat> @llvm.aarch64.sve.ld1q.gather.scalar.offset.nxv8bf16.nxv2i64(<vscale x 8 x i1>, <vscale x 2 x i64>, i64)
+
diff --git a/llvm/test/CodeGen/AArch64/sve2p1-intrinsics-ld1-single.ll b/llvm/test/CodeGen/AArch64/sve2p1-intrinsics-ld1-single.ll
new file mode 100644
index 000000000000000..1fbbab875403bb4
--- /dev/null
+++ b/llvm/test/CodeGen/AArch64/sve2p1-intrinsics-ld1-single.ll
@@ -0,0 +1,144 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 2
+; RUN: llc -mtriple=aarch64--linux-gnu -mattr=+sve2p1 < %s | FileCheck %s
+
+; LD1W
+
+define <vscale x 4 x i32> @test_svld1uwq_i32_ss(<vscale x 1 x i1> %pred, ptr %base, i64 %offset) {
+; CHECK-LABEL: test_svld1uwq_i32_ss:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ld1w { z0.q }, p0/z, [x0, x1, lsl #2]
+; CHECK-NEXT: ret
+ %gep = getelementptr i32, ptr %base, i64 %offset
+ %res = call <vscale x 4 x i32> @llvm.aarch64.sve.ld1uwq.nxv4i32(<vscale x 1 x i1> %pred, ptr %gep)
+ ret <vscale x 4 x i32> %res
+}
+
+define <vscale x 4 x i32> @test_svld1uwq_i32_si(<vscale x 1 x i1> %pred, <vscale x 4 x i32>* %base) {
+; CHECK-LABEL: test_svld1uwq_i32_si:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ld1w { z0.q }, p0/z, [x0, #-8, mul vl]
+; CHECK-NEXT: ld1w { z1.q }, p0/z, [x0, #7, mul vl]
+; CHECK-NEXT: add z0.s, z0.s, z1.s
+; CHECK-NEXT: ret
+ %gep1 = getelementptr inbounds <vscale x 1 x i32>, <vscale x 1 x i32>* %base, i64 -8
+ %res1 = call <vscale x 4 x i32> @llvm.aarch64.sve.ld1uwq.nxv4i32(<vscale x 1 x i1> %pred, ptr %gep1)
+
+ %gep2 = getelementptr inbounds <vscale x 1 x i32>, <vscale x 1 x i32>* %base, i64 7
+ %res2 = call <vscale x 4 x i32> @llvm.aarch64.sve.ld1uwq.nxv4i32(<vscale x 1 x i1> %pred, ptr %gep2)
+
+ %res = add <vscale x 4 x i32> %res1, %res2
+ ret <vscale x 4 x i32> %res
+}
+
+define <vscale x 4 x i32> @test_svld1uwq_i32_out_of_bound(<vscale x 1 x i1> %pred, <vscale x 4 x i32>* %base) {
+; CHECK-LABEL: test_svld1uwq_i32_out_of_bound:
+; CHECK: // %bb.0:
+; CHECK-NEXT: addvl x8, x0, #2
+; CHECK-NEXT: ld1w { z0.q }, p0/z, [x8]
+; CHECK-NEXT: ret
+ %gep = getelementptr inbounds <vscale x 1 x i32>, <vscale x 1 x i32>* %base, i64 8
+ %res = call <vscale x 4 x i32> @llvm.aarch64.sve.ld1uwq.nxv4i32(<vscale x 1 x i1> %pred, ptr %gep)
+
+ ret <vscale x 4 x i32> %res
+}
+
+define <vscale x 4 x float> @test_svld1uwq_f32_ss(<vscale x 1 x i1> %pred, ptr %base, i64 %offset) {
+; CHECK-LABEL: test_svld1uwq_f32_ss:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ld1w { z0.q }, p0/z, [x0, x1, lsl #2]
+; CHECK-NEXT: ret
+ %gep = getelementptr float, ptr %base, i64 %offset
+ %res = call <vscale x 4 x float> @llvm.aarch64.sve.ld1uwq.nxv4f32(<vscale x 1 x i1> %pred, ptr %gep)
+ ret <vscale x 4 x float> %res
+}
+
+define <vscale x 4 x float> @test_svld1uwq_f32_si(<vscale x 1 x i1> %pred, <vscale x 1 x float>* %base) {
+; CHECK-LABEL: test_svld1uwq_f32_si:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ld1w { z0.q }, p0/z, [x0, #-8, mul vl]
+; CHECK-NEXT: ld1w { z1.q }, p0/z, [x0, #7, mul vl]
+; CHECK-NEXT: fadd z0.s, z0.s, z1.s
+; CHECK-NEXT: ret
+ %gep1 = getelementptr inbounds <vscale x 1 x float>, <vscale x 1 x float>* %base, i64 -8
+ %res1 = call <vscale x 4 x float> @llvm.aarch64.sve.ld1uwq.nxv4f32(<vscale x 1 x i1> %pred, ptr %gep1)
+
+ %gep2 = getelementptr inbounds <vscale x 1 x float>, <vscale x 1 x float>* %base, i64 7
+ %res2 = call <vscale x 4 x float> @llvm.aarch64.sve.ld1uwq.nxv4f32(<vscale x 1 x i1> %pred, ptr %gep2)
+
+ %res = fadd <vscale x 4 x float> %res1, %res2
+ ret <vscale x 4 x float> %res
+}
+
+; LD1D
+
+define <vscale x 2 x i64> @test_svld1udq_i64_ss(<vscale x 1 x i1> %pred, ptr %base, i64 %offset) {
+; CHECK-LABEL: test_svld1udq_i64_ss:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ld1d { z0.q }, p0/z, [x0, x1, lsl #3]
+; CHECK-NEXT: ret
+ %gep = getelementptr i64, ptr %base, i64 %offset
+ %res = call <vscale x 2 x i64> @llvm.aarch64.sve.ld1udq.nxv2i64(<vscale x 1 x i1> %pred, ptr %gep)
+ ret <vscale x 2 x i64> %res
+}
+
+define <vscale x 2 x i64> @test_svld1udq_i64_si(<vscale x 1 x i1> %pred, <vscale x 1 x i64>* %base) {
+; CHECK-LABEL: test_svld1udq_i64_si:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ld1d { z0.q }, p0/z, [x0, #-8, mul vl]
+; CHECK-NEXT: ld1d { z1.q }, p0/z, [x0, #7, mul vl]
+; CHECK-NEXT: add z0.d, z0.d, z1.d
+; CHECK-NEXT: ret
+ %gep1 = getelementptr inbounds <vscale x 1 x i64>, <vscale x 1 x i64>* %base, i64 -8
+ %res1 = call <vscale x 2 x i64> @llvm.aarch64.sve.ld1udq.nxv2i64(<vscale x 1 x i1> %pred, ptr %gep1)
+
+ %gep2 = getelementptr inbounds <vscale x 1 x i64>, <vscale x 1 x i64>* %base, i64 7
+ %res2 = call <vscale x 2 x i64> @llvm.aarch64.sve.ld1udq.nxv2i64(<vscale x 1 x i1> %pred, ptr %gep2)
+
+ %res = add <vscale x 2 x i64> %res1, %res2
+ ret <vscale x 2 x i64> %res
+}
+
+define <vscale x 2 x i64> @test_svld1udq_i64_out_of_bound(<vscale x 1 x i1> %pred, <vscale x 1 x i64>* %base) {
+; CHECK-LABEL: test_svld1udq_i64_out_of_bound:
+; CHECK: // %bb.0:
+; CHECK-NEXT: addvl x8, x0, #-5
+; CHECK-NEXT: ld1d { z0.q }, p0/z, [x8]
+; CHECK-NEXT: ret
+ %gep = getelementptr inbounds <vscale x 1 x i64>, <vscale x 1 x i64>* %base, i64 -10
+ %res = call <vscale x 2 x i64> @llvm.aarch64.sve.ld1udq.nxv2i64(<vscale x 1 x i1> %pred, ptr %gep)
+
+ ret <vscale x 2 x i64> %res
+}
+
+define <vscale x 2 x double> @test_svld1udq_f64_ss(<vscale x 1 x i1> %pred, ptr %base, i64 %offset) {
+; CHECK-LABEL: test_svld1udq_f64_ss:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ld1d { z0.q }, p0/z, [x0, x1, lsl #3]
+; CHECK-NEXT: ret
+ %gep = getelementptr double, ptr %base, i64 %offset
+ %res = call <vscale x 2 x double> @llvm.aarch64.sve.ld1udq.nxv2f64(<vscale x 1 x i1> %pred, ptr %gep)
+ ret <vscale x 2 x double> %res
+}
+
+define <vscale x 2 x double> @test_svld1udq_f64_si(<vscale x 1 x i1> %pred, <vscale x 1 x double>* %base) {
+; CHECK-LABEL: test_svld1udq_f64_si:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ld1d { z0.q }, p0/z, [x0, #-8, mul vl]
+; CHECK-NEXT: ld1d { z1.q }, p0/z, [x0, #7, mul vl]
+; CHECK-NEXT: fadd z0.d, z0.d, z1.d
+; CHECK-NEXT: ret
+ %gep1 = getelementptr inbounds <vscale x 1 x double>, <vscale x 1 x double>* %base, i64 -8
+ %res1 = call <vscale x 2 x double> @llvm.aarch64.sve.ld1udq.nxv2f64(<vscale x 1 x i1> %pred, ptr %gep1)
+
+ %gep2 = getelementptr inbounds <vscale x 1 x double>, <vscale x 1 x double>* %base, i64 7
+ %res2 = call <vscale x 2 x double> @llvm.aarch64.sve.ld1udq.nxv2f64(<vscale x 1 x i1> %pred, ptr %gep2)
+
+ %res = fadd <vscale x 2 x double> %res1, %res2
+ ret <vscale x 2 x double> %res
+}
+
+declare <vscale x 4 x i32> @llvm.aarch64.sve.ld1uwq.nxv4i32(<vscale x 1 x i1>, ptr)
+declare <vscale x 4 x float> @llvm.aarch64.sve.ld1uwq.nxv4f32(<vscale x 1 x i1>, ptr)
+
+declare <vscale x 2 x i64> @llvm.aarch64.sve.ld1udq.nxv2i64(<vscale x 1 x i1>, ptr)
+declare <vscale x 2 x double> @llvm.aarch64.sve.ld1udq.nxv2f64(<vscale x 1 x i1>, ptr)
diff --git a/llvm/test/CodeGen/AArch64/sve2p1-intrinsics-multivec-loads.ll b/llvm/test/CodeGen/AArch64/sve2p1-intrinsics-multivec-loads.ll
new file mode 100644
index 000000000000000..a5d6bd4f930e92b
--- /dev/null
+++ b/llvm/test/CodeGen/AArch64/sve2p1-intrinsics-multivec-loads.ll
@@ -0,0 +1,797 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc -mtriple=aarch64-linux-gnu -mattr=+sve2p1,+bf16 < %s | FileCheck %s
+
+;;LD2Q
+
+define { <vscale x 16 x i8>, <vscale x 16 x i8> } @ld2q_si_i8_off16(<vscale x 16 x i1> %pg, <vscale x 16 x i8> *%addr ) {
+; CHECK-LABEL: ld2q_si_i8_off16:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ld2q { z0.q, z1.q }, p0/z, [x0, #-16, mul vl]
+; CHECK-NEXT: ret
+ %base = getelementptr <vscale x 16 x i8>, <vscale x 16 x i8>* %addr, i64 -16
+ %base_ptr = bitcast <vscale x 16 x i8>* %base to i8 *
+ %res = call { <vscale x 16 x i8>, <vscale x 16 x i8> } @llvm.aarch64.sve.ld2q.sret.nxv16i8(<vscale x 16 x i1> %pg, ptr %base_ptr);
+ ret { <vscale x 16 x i8>, <vscale x 16 x i8> } %res
+}
+
+define { <vscale x 16 x i8>, <vscale x 16 x i8> } @ld2q_si_i8_off14(<vscale x 16 x i1> %pg, <vscale x 16 x i8> *%addr ) {
+; CHECK-LABEL: ld2q_si_i8_off14:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ld2q { z0.q, z1.q }, p0/z, [x0, #14, mul vl]
+; CHECK-NEXT: ret
+ %base = getelementptr <vscale x 16 x i8>, <vscale x 16 x i8>* %addr, i64 14
+ %base_ptr = bitcast <vscale x 16 x i8>* %base to i8 *
+ %res = call { <vscale x 16 x i8>, <vscale x 16 x i8> } @llvm.aarch64.sve.ld2q.sret.nxv16i8(<vscale x 16 x i1> %pg, ptr %base_ptr);
+ ret { <vscale x 16 x i8>, <vscale x 16 x i8> } %res
+}
+
+define { <vscale x 16 x i8>, <vscale x 16 x i8> } @ld2q_ss_i8(<vscale x 16 x i1> %pg, ptr %addr, i64 %a) {
+; CHECK-LABEL: ld2q_ss_i8:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ld2q { z0.q, z1.q }, p0/z, [x0, x1, lsl #4]
+; CHECK-NEXT: ret
+ %addr2 = getelementptr i128, ptr %addr, i64 %a
+ %res = call { <vscale x 16 x i8>, <vscale x 16 x i8> } @llvm.aarch64.sve.ld2q.sret.nxv16i8(<vscale x 16 x i1> %pg, ptr %addr2);
+ ret { <vscale x 16 x i8>, <vscale x 16 x i8> } %res
+}
+
+define { <vscale x 16 x i8>, <vscale x 16 x i8> } @ld2q_i8(<vscale x 16 x i1> %pg, ptr %addr) {
+; CHECK-LABEL: ld2q_i8:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ld2q { z0.q, z1.q }, p0/z, [x0]
+; CHECK-NEXT: ret
+ %res = call { <vscale x 16 x i8>, <vscale x 16 x i8> } @llvm.aarch64.sve.ld2q.sret.nxv16i8(<vscale x 16 x i1> %pg, ptr %addr);
+ ret { <vscale x 16 x i8>, <vscale x 16 x i8> } %res
+}
+
+define { <vscale x 8 x i16>, <vscale x 8 x i16> } @ld2q_si_i16(<vscale x 8 x i1> %pg, <vscale x 8 x i16> *%addr ) {
+; CHECK-LABEL: ld2q_si_i16:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ld2q { z0.q, z1.q }, p0/z, [x0, #-16, mul vl]
+; CHECK-NEXT: ret
+ %base = getelementptr <vscale x 8 x i16>, <vscale x 8 x i16>* %addr, i64 -16
+ %base_ptr = bitcast <vscale x 8 x i16>* %base to i16 *
+ %res = call { <vscale x 8 x i16>, <vscale x 8 x i16> } @llvm.aarch64.sve.ld2q.sret.nxv8i16(<vscale x 8 x i1> %pg, ptr %base_ptr);
+ ret { <vscale x 8 x i16>, <vscale x 8 x i16> } %res
+}
+
+define { <vscale x 8 x i16>, <vscale x 8 x i16> } @ld2q_ss_i16(<vscale x 8 x i1> %pg, ptr %addr, i64 %a) {
+; CHECK-LABEL: ld2q_ss_i16:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ld2q { z0.q, z1.q }, p0/z, [x0, x1, lsl #4]
+; CHECK-NEXT: ret
+ %addr2 = getelementptr i128, ptr %addr, i64 %a
+ %res = call { <vscale x 8 x i16>, <vscale x 8 x i16> } @llvm.aarch64.sve.ld2q.sret.nxv8i16(<vscale x 8 x i1> %pg, ptr %addr2);
+ ret { <vscale x 8 x i16>, <vscale x 8 x i16> } %res
+}
+
+define { <vscale x 8 x i16>, <vscale x 8 x i16> } @ld2q_i16(<vscale x 8 x i1> %pg, ptr %addr) {
+; CHECK-LABEL: ld2q_i16:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ld2q { z0.q, z1.q }, p0/z, [x0]
+; CHECK-NEXT: ret
+ %res = call { <vscale x 8 x i16>, <vscale x 8 x i16> } @llvm.aarch64.sve.ld2q.sret.nxv8i16(<vscale x 8 x i1> %pg, ptr %addr);
+ ret { <vscale x 8 x i16>, <vscale x 8 x i16> } %res
+}
+
+define { <vscale x 4 x i32>, <vscale x 4 x i32> } @ld2q_si_i32(<vscale x 4 x i1> %pg, ptr %addr ) {
+; CHECK-LABEL: ld2q_si_i32:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ld2q { z0.q, z1.q }, p0/z, [x0, #-16, mul vl]
+; CHECK-NEXT: ret
+ %base = getelementptr <vscale x 4 x i32>, ptr %addr, i64 -16
+ %res = call { <vscale x 4 x i32>, <vscale x 4 x i32> } @llvm.aarch64.sve.ld2q.sret.nxv4i32(<vscale x 4 x i1> %pg, ptr %base);
+ ret { <vscale x 4 x i32>, <vscale x 4 x i32> } %res
+}
+
+define { <vscale x 4 x i32>, <vscale x 4 x i32> } @ld2q_ss_i32(<vscale x 4 x i1> %pg, ptr %addr, i64 %a) {
+; CHECK-LABEL: ld2q_ss_i32:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ld2q { z0.q, z1.q }, p0/z, [x0, x1, lsl #4]
+; CHECK-NEXT: ret
+ %addr2 = getelementptr i128, ptr %addr, i64 %a
+ %res = call { <vscale x 4 x i32>, <vscale x 4 x i32> } @llvm.aarch64.sve.ld2q.sret.nxv4i32(<vscale x 4 x i1> %pg, ptr %addr2);
+ ret { <vscale x 4 x i32>, <vscale x 4 x i32> } %res
+}
+
+define { <vscale x 4 x i32>, <vscale x 4 x i32> } @ld2q_i32(<vscale x 4 x i1> %pg, ptr %addr) {
+; CHECK-LABEL: ld2q_i32:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ld2q { z0.q, z1.q }, p0/z, [x0]
+; CHECK-NEXT: ret
+ %res = call { <vscale x 4 x i32>, <vscale x 4 x i32> } @llvm.aarch64.sve.ld2q.sret.nxv4i32(<vscale x 4 x i1> %pg, ptr %addr);
+ ret { <vscale x 4 x i32>, <vscale x 4 x i32> } %res
+}
+
+define { <vscale x 2 x i64>, <vscale x 2 x i64> } @ld2q_si_i64(<vscale x 2 x i1> %pg, <vscale x 2 x i64> *%addr ) {
+; CHECK-LABEL: ld2q_si_i64:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ld2q { z0.q, z1.q }, p0/z, [x0, #-16, mul vl]
+; CHECK-NEXT: ret
+ %base = getelementptr <vscale x 2 x i64>, <vscale x 2 x i64>* %addr, i64 -16
+ %base_ptr = bitcast <vscale x 2 x i64>* %base to i64 *
+ %res = call { <vscale x 2 x i64>, <vscale x 2 x i64> } @llvm.aarch64.sve.ld2q.sret.nxv2i64(<vscale x 2 x i1> %pg, ptr %base_ptr);
+ ret { <vscale x 2 x i64>, <vscale x 2 x i64> } %res
+}
+
+define { <vscale x 2 x i64>, <vscale x 2 x i64> } @ld2q_ss_i64(<vscale x 2 x i1> %pg, ptr %addr, i64 %a) {
+; CHECK-LABEL: ld2q_ss_i64:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ld2q { z0.q, z1.q }, p0/z, [x0, x1, lsl #4]
+; CHECK-NEXT: ret
+ %addr2 = getelementptr i128, ptr %addr, i64 %a
+ %res = call { <vscale x 2 x i64>, <vscale x 2 x i64> } @llvm.aarch64.sve.ld2q.sret.nxv2i64(<vscale x 2 x i1> %pg, ptr %addr2);
+ ret { <vscale x 2 x i64>, <vscale x 2 x i64> } %res
+}
+
+define { <vscale x 2 x i64>, <vscale x 2 x i64> } @ld2q_i64(<vscale x 2 x i1> %pg, ptr %addr) {
+; CHECK-LABEL: ld2q_i64:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ld2q { z0.q, z1.q }, p0/z, [x0]
+; CHECK-NEXT: ret
+ %res = call { <vscale x 2 x i64>, <vscale x 2 x i64> } @llvm.aarch64.sve.ld2q.sret.nxv2i64(<vscale x 2 x i1> %pg, ptr %addr);
+ ret { <vscale x 2 x i64>, <vscale x 2 x i64> } %res
+}
+
+define { <vscale x 8 x half>, <vscale x 8 x half> } @ld2q_si_f16(<vscale x 8 x i1> %pg, <vscale x 8 x half> *%addr ) {
+; CHECK-LABEL: ld2q_si_f16:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ld2q { z0.q, z1.q }, p0/z, [x0, #-16, mul vl]
+; CHECK-NEXT: ret
+ %base = getelementptr <vscale x 8 x half>, <vscale x 8 x half>* %addr, i64 -16
+ %base_ptr = bitcast <vscale x 8 x half>* %base to half *
+ %res = call { <vscale x 8 x half>, <vscale x 8 x half> } @llvm.aarch64.sve.ld2q.sret.nxv8f16(<vscale x 8 x i1> %pg, ptr %base_ptr);
+ ret { <vscale x 8 x half>, <vscale x 8 x half> } %res
+}
+
+define { <vscale x 8 x half>, <vscale x 8 x half> } @ld2q_ss_f16(<vscale x 8 x i1> %pg, ptr %addr, i64 %a) {
+; CHECK-LABEL: ld2q_ss_f16:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ld2q { z0.q, z1.q }, p0/z, [x0, x1, lsl #4]
+; CHECK-NEXT: ret
+ %addr2 = getelementptr i128, ptr %addr, i64 %a
+ %res = call { <vscale x 8 x half>, <vscale x 8 x half> } @llvm.aarch64.sve.ld2q.sret.nxv8f16(<vscale x 8 x i1> %pg, ptr %addr2);
+ ret { <vscale x 8 x half>, <vscale x 8 x half> } %res
+}
+
+define { <vscale x 8 x half>, <vscale x 8 x half> } @ld2q_f16(<vscale x 8 x i1> %pg, ptr %addr) {
+; CHECK-LABEL: ld2q_f16:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ld2q { z0.q, z1.q }, p0/z, [x0]
+; CHECK-NEXT: ret
+ %res = call { <vscale x 8 x half>, <vscale x 8 x half> } @llvm.aarch64.sve.ld2q.sret.nxv8f16(<vscale x 8 x i1> %pg, ptr %addr);
+ ret { <vscale x 8 x half>, <vscale x 8 x half> } %res
+}
+
+define { <vscale x 4 x float>, <vscale x 4 x float> } @ld2q_si_f32(<vscale x 4 x i1> %pg, <vscale x 4 x float> *%addr ) {
+; CHECK-LABEL: ld2q_si_f32:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ld2q { z0.q, z1.q }, p0/z, [x0, #-16, mul vl]
+; CHECK-NEXT: ret
+ %base = getelementptr <vscale x 4 x float>, <vscale x 4 x float>* %addr, i64 -16
+ %base_ptr = bitcast <vscale x 4 x float>* %base to float *
+ %res = call { <vscale x 4 x float>, <vscale x 4 x float> } @llvm.aarch64.sve.ld2q.sret.nxv4f32(<vscale x 4 x i1> %pg, ptr %base_ptr);
+ ret { <vscale x 4 x float>, <vscale x 4 x float> } %res
+}
+
+define { <vscale x 4 x float>, <vscale x 4 x float> } @ld2q_ss_f32(<vscale x 4 x i1> %pg, ptr %addr, i64 %a) {
+; CHECK-LABEL: ld2q_ss_f32:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ld2q { z0.q, z1.q }, p0/z, [x0, x1, lsl #4]
+; CHECK-NEXT: ret
+ %addr2 = getelementptr i128, ptr %addr, i64 %a
+ %res = call { <vscale x 4 x float>, <vscale x 4 x float> } @llvm.aarch64.sve.ld2q.sret.nxv4f32(<vscale x 4 x i1> %pg, ptr %addr2);
+ ret { <vscale x 4 x float>, <vscale x 4 x float> } %res
+}
+
+define { <vscale x 4 x float>, <vscale x 4 x float> } @ld2q_f32(<vscale x 4 x i1> %pg, ptr %addr) {
+; CHECK-LABEL: ld2q_f32:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ld2q { z0.q, z1.q }, p0/z, [x0]
+; CHECK-NEXT: ret
+ %res = call { <vscale x 4 x float>, <vscale x 4 x float> } @llvm.aarch64.sve.ld2q.sret.nxv4f32(<vscale x 4 x i1> %pg, ptr %addr);
+ ret { <vscale x 4 x float>, <vscale x 4 x float> } %res
+}
+
+define { <vscale x 2 x double>, <vscale x 2 x double> } @ld2q_si_f64(<vscale x 2 x i1> %pg, <vscale x 2 x double> *%addr ) {
+; CHECK-LABEL: ld2q_si_f64:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ld2q { z0.q, z1.q }, p0/z, [x0, #-16, mul vl]
+; CHECK-NEXT: ret
+ %base = getelementptr <vscale x 2 x double>, <vscale x 2 x double>* %addr, i64 -16
+ %base_ptr = bitcast <vscale x 2 x double>* %base to double *
+ %res = call { <vscale x 2 x double>, <vscale x 2 x double> } @llvm.aarch64.sve.ld2q.sret.nxv2f64(<vscale x 2 x i1> %pg, ptr %base_ptr);
+ ret { <vscale x 2 x double>, <vscale x 2 x double> } %res
+}
+
+define { <vscale x 2 x double>, <vscale x 2 x double> } @ld2q_ss_f64(<vscale x 2 x i1> %pg, ptr %addr, i64 %a) {
+; CHECK-LABEL: ld2q_ss_f64:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ld2q { z0.q, z1.q }, p0/z, [x0, x1, lsl #4]
+; CHECK-NEXT: ret
+ %addr2 = getelementptr i128, ptr %addr, i64 %a
+ %res = call { <vscale x 2 x double>, <vscale x 2 x double> } @llvm.aarch64.sve.ld2q.sret.nxv2f64(<vscale x 2 x i1> %pg, ptr %addr2);
+ ret { <vscale x 2 x double>, <vscale x 2 x double> } %res
+}
+
+define { <vscale x 2 x double>, <vscale x 2 x double> } @ld2q_f64(<vscale x 2 x i1> %pg, ptr %addr) {
+; CHECK-LABEL: ld2q_f64:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ld2q { z0.q, z1.q }, p0/z, [x0]
+; CHECK-NEXT: ret
+ %res = call { <vscale x 2 x double>, <vscale x 2 x double> } @llvm.aarch64.sve.ld2q.sret.nxv2f64(<vscale x 2 x i1> %pg, ptr %addr);
+ ret { <vscale x 2 x double>, <vscale x 2 x double> } %res
+}
+
+define { <vscale x 8 x bfloat>, <vscale x 8 x bfloat> } @ld2q_si_bf16(<vscale x 8 x i1> %pg, <vscale x 8 x bfloat> *%addr ) {
+; CHECK-LABEL: ld2q_si_bf16:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ld2q { z0.q, z1.q }, p0/z, [x0, #-16, mul vl]
+; CHECK-NEXT: ret
+ %base = getelementptr <vscale x 8 x bfloat>, <vscale x 8 x bfloat>* %addr, i64 -16
+ %base_ptr = bitcast <vscale x 8 x bfloat>* %base to bfloat *
+ %res = call { <vscale x 8 x bfloat>, <vscale x 8 x bfloat> } @llvm.aarch64.sve.ld2q.sret.nxv8bf16(<vscale x 8 x i1> %pg, ptr %base_ptr);
+ ret { <vscale x 8 x bfloat>, <vscale x 8 x bfloat> } %res
+}
+
+define { <vscale x 8 x bfloat>, <vscale x 8 x bfloat> } @ld2q_ss_bf16(<vscale x 8 x i1> %pg, ptr %addr, i64 %a) {
+; CHECK-LABEL: ld2q_ss_bf16:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ld2q { z0.q, z1.q }, p0/z, [x0, x1, lsl #4]
+; CHECK-NEXT: ret
+ %addr2 = getelementptr i128, ptr %addr, i64 %a
+ %res = call { <vscale x 8 x bfloat>, <vscale x 8 x bfloat> } @llvm.aarch64.sve.ld2q.sret.nxv8bf16(<vscale x 8 x i1> %pg, ptr %addr2);
+ ret { <vscale x 8 x bfloat>, <vscale x 8 x bfloat> } %res
+}
+
+define { <vscale x 8 x bfloat>, <vscale x 8 x bfloat> } @ld2q_bf16(<vscale x 8 x i1> %pg, ptr %addr) {
+; CHECK-LABEL: ld2q_bf16:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ld2q { z0.q, z1.q }, p0/z, [x0]
+; CHECK-NEXT: ret
+ %res = call { <vscale x 8 x bfloat>, <vscale x 8 x bfloat> } @llvm.aarch64.sve.ld2q.sret.nxv8bf16(<vscale x 8 x i1> %pg, ptr %addr);
+ ret { <vscale x 8 x bfloat>, <vscale x 8 x bfloat> } %res
+}
+
+;; LD3Q
+define { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } @ld3q_si_i8_off24(<vscale x 16 x i1> %pg, <vscale x 16 x i8> *%addr ) {
+; CHECK-LABEL: ld3q_si_i8_off24:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ld3q { z0.q - z2.q }, p0/z, [x0, #-24, mul vl]
+; CHECK-NEXT: ret
+ %base = getelementptr <vscale x 16 x i8>, <vscale x 16 x i8>* %addr, i64 -24
+ %base_ptr = bitcast <vscale x 16 x i8>* %base to i8 *
+ %res = call { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } @llvm.aarch64.sve.ld3q.sret.nxv16i8(<vscale x 16 x i1> %pg, ptr %base_ptr);
+ ret { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %res
+}
+
+define { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } @ld3q_si_i8_off21(<vscale x 16 x i1> %pg, <vscale x 16 x i8> *%addr ) {
+; CHECK-LABEL: ld3q_si_i8_off21:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ld3q { z0.q - z2.q }, p0/z, [x0, #21, mul vl]
+; CHECK-NEXT: ret
+ %base = getelementptr <vscale x 16 x i8>, <vscale x 16 x i8>* %addr, i64 21
+ %base_ptr = bitcast <vscale x 16 x i8>* %base to i8 *
+ %res = call { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } @llvm.aarch64.sve.ld3q.sret.nxv16i8(<vscale x 16 x i1> %pg, ptr %base_ptr);
+ ret { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %res
+}
+
+define { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } @ld3q_ss_i8(<vscale x 16 x i1> %pg, ptr %addr, i64 %a) {
+; CHECK-LABEL: ld3q_ss_i8:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ld3q { z0.q - z2.q }, p0/z, [x0, x1, lsl #4]
+; CHECK-NEXT: ret
+ %addr2 = getelementptr i128, ptr %addr, i64 %a
+ %res = call { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } @llvm.aarch64.sve.ld3q.sret.nxv16i8(<vscale x 16 x i1> %pg, ptr %addr2);
+ ret { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %res
+}
+
+define { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>} @ld3q_i8(<vscale x 16 x i1> %pg, ptr %addr) {
+; CHECK-LABEL: ld3q_i8:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ld3q { z0.q - z2.q }, p0/z, [x0]
+; CHECK-NEXT: ret
+ %res = call { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>} @llvm.aarch64.sve.ld3q.sret.nxv16i8(<vscale x 16 x i1> %pg, ptr %addr);
+ ret { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>} %res
+}
+
+define { <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16> } @ld3q_si_i16(<vscale x 8 x i1> %pg, <vscale x 8 x i16> *%addr ) {
+; CHECK-LABEL: ld3q_si_i16:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ld3q { z0.q - z2.q }, p0/z, [x0, #-24, mul vl]
+; CHECK-NEXT: ret
+ %base = getelementptr <vscale x 8 x i16>, <vscale x 8 x i16>* %addr, i64 -24
+ %base_ptr = bitcast <vscale x 8 x i16>* %base to i16 *
+ %res = call { <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16> } @llvm.aarch64.sve.ld3q.sret.nxv8i16(<vscale x 8 x i1> %pg, ptr %base_ptr);
+ ret { <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16> } %res
+}
+
+define { <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16> } @ld3q_ss_i16(<vscale x 8 x i1> %pg, ptr %addr, i64 %a) {
+; CHECK-LABEL: ld3q_ss_i16:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ld3q { z0.q - z2.q }, p0/z, [x0, x1, lsl #4]
+; CHECK-NEXT: ret
+ %addr2 = getelementptr i128, ptr %addr, i64 %a
+ %res = call { <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16> } @llvm.aarch64.sve.ld3q.sret.nxv8i16(<vscale x 8 x i1> %pg, ptr %addr2);
+ ret { <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16> } %res
+}
+
+define { <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16> } @ld3q_i16(<vscale x 8 x i1> %pg, ptr %addr) {
+; CHECK-LABEL: ld3q_i16:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ld3q { z0.q - z2.q }, p0/z, [x0]
+; CHECK-NEXT: ret
+ %res = call { <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16> } @llvm.aarch64.sve.ld3q.sret.nxv8i16(<vscale x 8 x i1> %pg, ptr %addr);
+ ret { <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16> } %res
+}
+
+define { <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32> } @ld3q_si_i32(<vscale x 4 x i1> %pg, <vscale x 4 x i32> *%addr ) {
+; CHECK-LABEL: ld3q_si_i32:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ld3q { z0.q - z2.q }, p0/z, [x0, #-24, mul vl]
+; CHECK-NEXT: ret
+ %base = getelementptr <vscale x 4 x i32>, <vscale x 4 x i32>* %addr, i64 -24
+ %base_ptr = bitcast <vscale x 4 x i32>* %base to i32 *
+ %res = call { <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32> } @llvm.aarch64.sve.ld3q.sret.nxv4i32(<vscale x 4 x i1> %pg, ptr %base_ptr);
+ ret { <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32> } %res
+}
+
+define { <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32> } @ld3q_ss_i32(<vscale x 4 x i1> %pg, ptr %addr, i64 %a) {
+; CHECK-LABEL: ld3q_ss_i32:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ld3q { z0.q - z2.q }, p0/z, [x0, x1, lsl #4]
+; CHECK-NEXT: ret
+ %addr2 = getelementptr i128, ptr %addr, i64 %a
+ %res = call { <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32> } @llvm.aarch64.sve.ld3q.sret.nxv4i32(<vscale x 4 x i1> %pg, ptr %addr2);
+ ret { <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32> } %res
+}
+
+define { <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32> } @ld3q_i32(<vscale x 4 x i1> %pg, ptr %addr) {
+; CHECK-LABEL: ld3q_i32:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ld3q { z0.q - z2.q }, p0/z, [x0]
+; CHECK-NEXT: ret
+ %res = call { <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32> } @llvm.aarch64.sve.ld3q.sret.nxv4i32(<vscale x 4 x i1> %pg, ptr %addr);
+ ret { <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32> } %res
+}
+
+define { <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64> } @ld3q_si_i64(<vscale x 2 x i1> %pg, ptr %addr ) {
+; CHECK-LABEL: ld3q_si_i64:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ld3q { z0.q - z2.q }, p0/z, [x0, #-24, mul vl]
+; CHECK-NEXT: ret
+ %addr2 = getelementptr <vscale x 4 x i32>, ptr %addr, i64 -24
+ %res = call { <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64> } @llvm.aarch64.sve.ld3q.sret.nxv2i64(<vscale x 2 x i1> %pg, ptr %addr2);
+ ret {<vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64> } %res
+}
+
+define { <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64> } @ld3q_ss_i64(<vscale x 2 x i1> %pg, ptr %addr, i64 %a) {
+; CHECK-LABEL: ld3q_ss_i64:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ld3q { z0.q - z2.q }, p0/z, [x0, x1, lsl #4]
+; CHECK-NEXT: ret
+ %addr2 = getelementptr i128, ptr %addr, i64 %a
+ %res = call { <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64> } @llvm.aarch64.sve.ld3q.sret.nxv2i64(<vscale x 2 x i1> %pg, ptr %addr2);
+ ret { <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64> } %res
+}
+
+define { <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64> } @ld3q_i64(<vscale x 2 x i1> %pg, ptr %addr) {
+; CHECK-LABEL: ld3q_i64:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ld3q { z0.q - z2.q }, p0/z, [x0]
+; CHECK-NEXT: ret
+ %res = call { <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64> } @llvm.aarch64.sve.ld3q.sret.nxv2i64(<vscale x 2 x i1> %pg, ptr %addr);
+ ret { <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64> } %res
+}
+
+define { <vscale x 8 x half>, <vscale x 8 x half>, <vscale x 8 x half> } @ld3q_si_f16(<vscale x 8 x i1> %pg, <vscale x 8 x half> *%addr ) {
+; CHECK-LABEL: ld3q_si_f16:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ld3q { z0.q - z2.q }, p0/z, [x0, #-24, mul vl]
+; CHECK-NEXT: ret
+ %base = getelementptr <vscale x 8 x half>, <vscale x 8 x half>* %addr, i64 -24
+ %base_ptr = bitcast <vscale x 8 x half>* %base to half *
+ %res = call { <vscale x 8 x half>, <vscale x 8 x half>, <vscale x 8 x half> } @llvm.aarch64.sve.ld3q.sret.nxv8f16(<vscale x 8 x i1> %pg, ptr %base_ptr);
+ ret { <vscale x 8 x half>, <vscale x 8 x half>, <vscale x 8 x half> } %res
+}
+
+define { <vscale x 8 x half>, <vscale x 8 x half>, <vscale x 8 x half> } @ld3q_ss_f16(<vscale x 8 x i1> %pg, ptr %addr, i64 %a) {
+; CHECK-LABEL: ld3q_ss_f16:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ld3q { z0.q - z2.q }, p0/z, [x0, x1, lsl #4]
+; CHECK-NEXT: ret
+ %addr2 = getelementptr i128, ptr %addr, i64 %a
+ %res = call { <vscale x 8 x half>, <vscale x 8 x half>, <vscale x 8 x half> } @llvm.aarch64.sve.ld3q.sret.nxv8f16(<vscale x 8 x i1> %pg, ptr %addr2);
+ ret { <vscale x 8 x half>, <vscale x 8 x half>, <vscale x 8 x half> } %res
+}
+
+define { <vscale x 8 x half>, <vscale x 8 x half>, <vscale x 8 x half> } @ld3q_f16(<vscale x 8 x i1> %pg, ptr %addr) {
+; CHECK-LABEL: ld3q_f16:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ld3q { z0.q - z2.q }, p0/z, [x0]
+; CHECK-NEXT: ret
+ %res = call { <vscale x 8 x half>, <vscale x 8 x half>, <vscale x 8 x half> } @llvm.aarch64.sve.ld3q.sret.nxv8f16(<vscale x 8 x i1> %pg, ptr %addr);
+ ret { <vscale x 8 x half>, <vscale x 8 x half>, <vscale x 8 x half> } %res
+}
+
+define { <vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float> } @ld3q_si_f32(<vscale x 4 x i1> %pg, <vscale x 4 x float> *%addr ) {
+; CHECK-LABEL: ld3q_si_f32:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ld3q { z0.q - z2.q }, p0/z, [x0, #-24, mul vl]
+; CHECK-NEXT: ret
+ %base = getelementptr <vscale x 4 x float>, <vscale x 4 x float>* %addr, i64 -24
+ %base_ptr = bitcast <vscale x 4 x float>* %base to float *
+ %res = call { <vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float> } @llvm.aarch64.sve.ld3q.sret.nxv4f32(<vscale x 4 x i1> %pg, ptr %base_ptr);
+ ret { <vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float> } %res
+}
+
+define { <vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float> } @ld3q_ss_f32(<vscale x 4 x i1> %pg, ptr %addr, i64 %a) {
+; CHECK-LABEL: ld3q_ss_f32:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ld3q { z0.q - z2.q }, p0/z, [x0, x1, lsl #4]
+; CHECK-NEXT: ret
+ %addr2 = getelementptr i128, ptr %addr, i64 %a
+ %res = call { <vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float> } @llvm.aarch64.sve.ld3q.sret.nxv4f32(<vscale x 4 x i1> %pg, ptr %addr2);
+ ret { <vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float> } %res
+}
+
+define { <vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float> } @ld3q_f32(<vscale x 4 x i1> %pg, ptr %addr) {
+; CHECK-LABEL: ld3q_f32:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ld3q { z0.q - z2.q }, p0/z, [x0]
+; CHECK-NEXT: ret
+ %res = call { <vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float> } @llvm.aarch64.sve.ld3q.sret.nxv4f32(<vscale x 4 x i1> %pg, ptr %addr);
+ ret { <vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float> } %res
+}
+
+define { <vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x double> } @ld3q_si_f64(<vscale x 2 x i1> %pg, <vscale x 2 x double> *%addr ) {
+; CHECK-LABEL: ld3q_si_f64:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ld3q { z0.q - z2.q }, p0/z, [x0, #-24, mul vl]
+; CHECK-NEXT: ret
+ %base = getelementptr <vscale x 2 x double>, <vscale x 2 x double>* %addr, i64 -24
+ %base_ptr = bitcast <vscale x 2 x double>* %base to double *
+ %res = call { <vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x double> } @llvm.aarch64.sve.ld3q.sret.nxv2f64(<vscale x 2 x i1> %pg, ptr %base_ptr);
+ ret { <vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x double> } %res
+}
+
+define { <vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x double> } @ld3q_ss_f64(<vscale x 2 x i1> %pg, ptr %addr, i64 %a) {
+; CHECK-LABEL: ld3q_ss_f64:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ld3q { z0.q - z2.q }, p0/z, [x0, x1, lsl #4]
+; CHECK-NEXT: ret
+ %addr2 = getelementptr i128, ptr %addr, i64 %a
+ %res = call { <vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x double> } @llvm.aarch64.sve.ld3q.sret.nxv2f64(<vscale x 2 x i1> %pg, ptr %addr2);
+ ret { <vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x double> } %res
+}
+
+define { <vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x double> } @ld3q_f64(<vscale x 2 x i1> %pg, ptr %addr) {
+; CHECK-LABEL: ld3q_f64:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ld3q { z0.q - z2.q }, p0/z, [x0]
+; CHECK-NEXT: ret
+ %res = call { <vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x double> } @llvm.aarch64.sve.ld3q.sret.nxv2f64(<vscale x 2 x i1> %pg, ptr %addr);
+ ret { <vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x double> } %res
+}
+
+define { <vscale x 8 x bfloat>, <vscale x 8 x bfloat>, <vscale x 8 x bfloat> } @ld3q_si_bf16(<vscale x 8 x i1> %pg, <vscale x 8 x bfloat> *%addr ) {
+; CHECK-LABEL: ld3q_si_bf16:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ld3q { z0.q - z2.q }, p0/z, [x0, #-24, mul vl]
+; CHECK-NEXT: ret
+ %base = getelementptr <vscale x 8 x bfloat>, <vscale x 8 x bfloat>* %addr, i64 -24
+ %base_ptr = bitcast <vscale x 8 x bfloat>* %base to bfloat *
+ %res = call { <vscale x 8 x bfloat>, <vscale x 8 x bfloat>, <vscale x 8 x bfloat> } @llvm.aarch64.sve.ld3q.sret.nxv8bf16(<vscale x 8 x i1> %pg, ptr %base_ptr);
+ ret { <vscale x 8 x bfloat>, <vscale x 8 x bfloat>, <vscale x 8 x bfloat> } %res
+}
+
+define { <vscale x 8 x bfloat>, <vscale x 8 x bfloat>, <vscale x 8 x bfloat> } @ld3q_ss_bf16(<vscale x 8 x i1> %pg, ptr %addr, i64 %a) {
+; CHECK-LABEL: ld3q_ss_bf16:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ld3q { z0.q - z2.q }, p0/z, [x0, x1, lsl #4]
+; CHECK-NEXT: ret
+ %addr2 = getelementptr i128, ptr %addr, i64 %a
+ %res = call { <vscale x 8 x bfloat>, <vscale x 8 x bfloat>, <vscale x 8 x bfloat> } @llvm.aarch64.sve.ld3q.sret.nxv8bf16(<vscale x 8 x i1> %pg, ptr %addr2);
+ ret { <vscale x 8 x bfloat>, <vscale x 8 x bfloat>, <vscale x 8 x bfloat> } %res
+}
+
+define { <vscale x 8 x bfloat>, <vscale x 8 x bfloat>, <vscale x 8 x bfloat> } @ld3q_bf16(<vscale x 8 x i1> %pg, ptr %addr) {
+; CHECK-LABEL: ld3q_bf16:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ld3q { z0.q - z2.q }, p0/z, [x0]
+; CHECK-NEXT: ret
+ %res = call { <vscale x 8 x bfloat>, <vscale x 8 x bfloat>, <vscale x 8 x bfloat> } @llvm.aarch64.sve.ld3q.sret.nxv8bf16(<vscale x 8 x i1> %pg, ptr %addr);
+ ret { <vscale x 8 x bfloat>, <vscale x 8 x bfloat>, <vscale x 8 x bfloat> } %res
+}
+
+;; LD4Q
+define { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } @ld4q_si_i8_off32(<vscale x 16 x i1> %pg, <vscale x 16 x i8> *%addr ) {
+; CHECK-LABEL: ld4q_si_i8_off32:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ld4q { z0.q - z3.q }, p0/z, [x0, #-32, mul vl]
+; CHECK-NEXT: ret
+ %base = getelementptr <vscale x 16 x i8>, <vscale x 16 x i8>* %addr, i64 -32
+ %base_ptr = bitcast <vscale x 16 x i8>* %base to i8 *
+ %res = call { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } @llvm.aarch64.sve.ld4q.sret.nxv16i8(<vscale x 16 x i1> %pg, ptr %base_ptr);
+ ret { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %res
+}
+
+define { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } @ld4q_si_i8_off28(<vscale x 16 x i1> %pg, <vscale x 16 x i8> *%addr ) {
+; CHECK-LABEL: ld4q_si_i8_off28:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ld4q { z0.q - z3.q }, p0/z, [x0, #28, mul vl]
+; CHECK-NEXT: ret
+ %base = getelementptr <vscale x 16 x i8>, <vscale x 16 x i8>* %addr, i64 28
+ %base_ptr = bitcast <vscale x 16 x i8>* %base to i8 *
+ %res = call { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } @llvm.aarch64.sve.ld4q.sret.nxv16i8(<vscale x 16 x i1> %pg, ptr %base_ptr);
+ ret { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %res
+}
+
+define { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } @ld4q_ss_i8(<vscale x 16 x i1> %pg, ptr %addr, i64 %a) {
+; CHECK-LABEL: ld4q_ss_i8:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ld4q { z0.q - z3.q }, p0/z, [x0, x1, lsl #4]
+; CHECK-NEXT: ret
+ %addr2 = getelementptr i128, ptr %addr, i64 %a
+ %res = call { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } @llvm.aarch64.sve.ld4q.sret.nxv16i8(<vscale x 16 x i1> %pg, ptr %addr2);
+ ret { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %res
+}
+
+define { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>} @ld4q_i8(<vscale x 16 x i1> %pg, ptr %addr) {
+; CHECK-LABEL: ld4q_i8:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ld4q { z0.q - z3.q }, p0/z, [x0]
+; CHECK-NEXT: ret
+ %res = call { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>} @llvm.aarch64.sve.ld4q.sret.nxv16i8(<vscale x 16 x i1> %pg, ptr %addr);
+ ret { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>} %res
+}
+
+define { <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16> } @ld4q_si_i16(<vscale x 8 x i1> %pg, <vscale x 8 x i16> *%addr ) {
+; CHECK-LABEL: ld4q_si_i16:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ld4q { z0.q - z3.q }, p0/z, [x0, #-32, mul vl]
+; CHECK-NEXT: ret
+ %base = getelementptr <vscale x 8 x i16>, <vscale x 8 x i16>* %addr, i64 -32
+ %base_ptr = bitcast <vscale x 8 x i16>* %base to i16 *
+ %res = call { <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16> } @llvm.aarch64.sve.ld4q.sret.nxv8i16(<vscale x 8 x i1> %pg, ptr %base_ptr);
+ ret { <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16> } %res
+}
+
+define { <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16> } @ld4q_ss_i16(<vscale x 8 x i1> %pg, ptr %addr, i64 %a) {
+; CHECK-LABEL: ld4q_ss_i16:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ld4q { z0.q - z3.q }, p0/z, [x0, x1, lsl #4]
+; CHECK-NEXT: ret
+ %addr2 = getelementptr i128, ptr %addr, i64 %a
+ %res = call { <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16> } @llvm.aarch64.sve.ld4q.sret.nxv8i16(<vscale x 8 x i1> %pg, ptr %addr2);
+ ret { <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16> } %res
+}
+
+define { <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16> } @ld4q_i16(<vscale x 8 x i1> %pg, ptr %addr) {
+; CHECK-LABEL: ld4q_i16:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ld4q { z0.q - z3.q }, p0/z, [x0]
+; CHECK-NEXT: ret
+ %res = call { <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16> } @llvm.aarch64.sve.ld4q.sret.nxv8i16(<vscale x 8 x i1> %pg, ptr %addr);
+ ret { <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16> } %res
+}
+
+define { <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32> } @ld4q_si_i32(<vscale x 4 x i1> %pg, <vscale x 4 x i32> *%addr ) {
+; CHECK-LABEL: ld4q_si_i32:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ld4q { z0.q - z3.q }, p0/z, [x0, #-32, mul vl]
+; CHECK-NEXT: ret
+ %base = getelementptr <vscale x 4 x i32>, <vscale x 4 x i32>* %addr, i64 -32
+ %base_ptr = bitcast <vscale x 4 x i32>* %base to i32 *
+ %res = call { <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32> } @llvm.aarch64.sve.ld4q.sret.nxv4i32(<vscale x 4 x i1> %pg, ptr %base_ptr);
+ ret { <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32> } %res
+}
+
+define { <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32> } @ld4q_ss_i32(<vscale x 4 x i1> %pg, ptr %addr, i64 %a) {
+; CHECK-LABEL: ld4q_ss_i32:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ld4q { z0.q - z3.q }, p0/z, [x0, x1, lsl #4]
+; CHECK-NEXT: ret
+ %addr2 = getelementptr i128, ptr %addr, i64 %a
+ %res = call { <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32> } @llvm.aarch64.sve.ld4q.sret.nxv4i32(<vscale x 4 x i1> %pg, ptr %addr2);
+ ret { <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32> } %res
+}
+
+define { <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32> } @ld4q_i32(<vscale x 4 x i1> %pg, ptr %addr) {
+; CHECK-LABEL: ld4q_i32:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ld4q { z0.q - z3.q }, p0/z, [x0]
+; CHECK-NEXT: ret
+ %res = call { <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32> } @llvm.aarch64.sve.ld4q.sret.nxv4i32(<vscale x 4 x i1> %pg, ptr %addr);
+ ret { <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32> } %res
+}
+
+define { <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64> } @ld4q_si_i64(<vscale x 2 x i1> %pg, <vscale x 2 x i64> *%addr ) {
+; CHECK-LABEL: ld4q_si_i64:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ld4q { z0.q - z3.q }, p0/z, [x0, #-32, mul vl]
+; CHECK-NEXT: ret
+ %base = getelementptr <vscale x 2 x i64>, <vscale x 2 x i64>* %addr, i64 -32
+ %base_ptr = bitcast <vscale x 2 x i64>* %base to i64 *
+ %res = call { <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64> } @llvm.aarch64.sve.ld4q.sret.nxv2i64(<vscale x 2 x i1> %pg, ptr %base_ptr);
+ ret { <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64> } %res
+}
+
+define { <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64> } @ld4q_ss_i64(<vscale x 2 x i1> %pg, ptr %addr, i64 %a) {
+; CHECK-LABEL: ld4q_ss_i64:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ld4q { z0.q - z3.q }, p0/z, [x0, x1, lsl #4]
+; CHECK-NEXT: ret
+ %addr2 = getelementptr i128, ptr %addr, i64 %a
+ %res = call { <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64> } @llvm.aarch64.sve.ld4q.sret.nxv2i64(<vscale x 2 x i1> %pg, ptr %addr2);
+ ret { <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64> } %res
+}
+
+define { <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64> } @ld4q_i64(<vscale x 2 x i1> %pg, ptr %addr) {
+; CHECK-LABEL: ld4q_i64:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ld4q { z0.q - z3.q }, p0/z, [x0]
+; CHECK-NEXT: ret
+ %res = call { <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64> } @llvm.aarch64.sve.ld4q.sret.nxv2i64(<vscale x 2 x i1> %pg, ptr %addr);
+ ret { <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64> } %res
+}
+
+define { <vscale x 8 x half>, <vscale x 8 x half>, <vscale x 8 x half>, <vscale x 8 x half> } @ld4q_si_f16(<vscale x 8 x i1> %pg, <vscale x 8 x half> *%addr ) {
+; CHECK-LABEL: ld4q_si_f16:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ld4q { z0.q - z3.q }, p0/z, [x0, #-32, mul vl]
+; CHECK-NEXT: ret
+ %base = getelementptr <vscale x 8 x half>, <vscale x 8 x half>* %addr, i64 -32
+ %base_ptr = bitcast <vscale x 8 x half>* %base to half *
+ %res = call { <vscale x 8 x half>, <vscale x 8 x half>, <vscale x 8 x half>, <vscale x 8 x half> } @llvm.aarch64.sve.ld4q.sret.nxv8f16(<vscale x 8 x i1> %pg, ptr %base_ptr);
+ ret { <vscale x 8 x half>, <vscale x 8 x half>, <vscale x 8 x half>, <vscale x 8 x half> } %res
+}
+
+define { <vscale x 8 x half>, <vscale x 8 x half>, <vscale x 8 x half>, <vscale x 8 x half> } @ld4q_ss_f16(<vscale x 8 x i1> %pg, ptr %addr, i64 %a) {
+; CHECK-LABEL: ld4q_ss_f16:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ld4q { z0.q - z3.q }, p0/z, [x0, x1, lsl #4]
+; CHECK-NEXT: ret
+ %addr2 = getelementptr i128, ptr %addr, i64 %a
+ %res = call { <vscale x 8 x half>, <vscale x 8 x half>, <vscale x 8 x half>, <vscale x 8 x half> } @llvm.aarch64.sve.ld4q.sret.nxv8f16(<vscale x 8 x i1> %pg, ptr %addr2);
+ ret { <vscale x 8 x half>, <vscale x 8 x half>, <vscale x 8 x half>, <vscale x 8 x half> } %res
+}
+
+define { <vscale x 8 x half>, <vscale x 8 x half>, <vscale x 8 x half>, <vscale x 8 x half> } @ld4q_f16(<vscale x 8 x i1> %pg, ptr %addr) {
+; CHECK-LABEL: ld4q_f16:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ld4q { z0.q - z3.q }, p0/z, [x0]
+; CHECK-NEXT: ret
+ %res = call { <vscale x 8 x half>, <vscale x 8 x half>, <vscale x 8 x half>, <vscale x 8 x half> } @llvm.aarch64.sve.ld4q.sret.nxv8f16(<vscale x 8 x i1> %pg, ptr %addr);
+ ret { <vscale x 8 x half>, <vscale x 8 x half>, <vscale x 8 x half>, <vscale x 8 x half> } %res
+}
+
+define { <vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float> } @ld4q_si_f32(<vscale x 4 x i1> %pg, <vscale x 4 x float> *%addr ) {
+; CHECK-LABEL: ld4q_si_f32:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ld4q { z0.q - z3.q }, p0/z, [x0, #-32, mul vl]
+; CHECK-NEXT: ret
+ %base = getelementptr <vscale x 4 x float>, <vscale x 4 x float>* %addr, i64 -32
+ %base_ptr = bitcast <vscale x 4 x float>* %base to float *
+ %res = call { <vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float> } @llvm.aarch64.sve.ld4q.sret.nxv4f32(<vscale x 4 x i1> %pg, ptr %base_ptr);
+ ret { <vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float> } %res
+}
+
+define { <vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float> } @ld4q_ss_f32(<vscale x 4 x i1> %pg, ptr %addr, i64 %a) {
+; CHECK-LABEL: ld4q_ss_f32:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ld4q { z0.q - z3.q }, p0/z, [x0, x1, lsl #4]
+; CHECK-NEXT: ret
+ %addr2 = getelementptr i128, ptr %addr, i64 %a
+ %res = call { <vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float> } @llvm.aarch64.sve.ld4q.sret.nxv4f32(<vscale x 4 x i1> %pg, ptr %addr2);
+ ret { <vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float> } %res
+}
+
+define { <vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float> } @ld4q_f32(<vscale x 4 x i1> %pg, ptr %addr) {
+; CHECK-LABEL: ld4q_f32:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ld4q { z0.q - z3.q }, p0/z, [x0]
+; CHECK-NEXT: ret
+ %res = call { <vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float> } @llvm.aarch64.sve.ld4q.sret.nxv4f32(<vscale x 4 x i1> %pg, ptr %addr);
+ ret { <vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float> } %res
+}
+
+define { <vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x double> } @ld4q_si_f64(<vscale x 2 x i1> %pg, <vscale x 2 x double> *%addr ) {
+; CHECK-LABEL: ld4q_si_f64:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ld4q { z0.q - z3.q }, p0/z, [x0, #-32, mul vl]
+; CHECK-NEXT: ret
+ %base = getelementptr <vscale x 2 x double>, <vscale x 2 x double>* %addr, i64 -32
+ %base_ptr = bitcast <vscale x 2 x double>* %base to double *
+ %res = call { <vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x double> } @llvm.aarch64.sve.ld4q.sret.nxv2f64(<vscale x 2 x i1> %pg, ptr %base_ptr);
+ ret { <vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x double> } %res
+}
+
+define { <vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x double> } @ld4q_ss_f64(<vscale x 2 x i1> %pg, ptr %addr, i64 %a) {
+; CHECK-LABEL: ld4q_ss_f64:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ld4q { z0.q - z3.q }, p0/z, [x0, x1, lsl #4]
+; CHECK-NEXT: ret
+ %addr2 = getelementptr i128, ptr %addr, i64 %a
+ %res = call { <vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x double> } @llvm.aarch64.sve.ld4q.sret.nxv2f64(<vscale x 2 x i1> %pg, ptr %addr2);
+ ret { <vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x double> } %res
+}
+
+define { <vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x double> } @ld4q_f64(<vscale x 2 x i1> %pg, ptr %addr) {
+; CHECK-LABEL: ld4q_f64:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ld4q { z0.q - z3.q }, p0/z, [x0]
+; CHECK-NEXT: ret
+ %res = call { <vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x double> } @llvm.aarch64.sve.ld4q.sret.nxv2f64(<vscale x 2 x i1> %pg, ptr %addr);
+ ret { <vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x double> } %res
+}
+
+define { <vscale x 8 x bfloat>, <vscale x 8 x bfloat>, <vscale x 8 x bfloat>, <vscale x 8 x bfloat> } @ld4q_si_bf16(<vscale x 8 x i1> %pg, <vscale x 8 x bfloat> *%addr ) {
+; CHECK-LABEL: ld4q_si_bf16:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ld4q { z0.q - z3.q }, p0/z, [x0, #-32, mul vl]
+; CHECK-NEXT: ret
+ %base = getelementptr <vscale x 8 x bfloat>, <vscale x 8 x bfloat>* %addr, i64 -32
+ %base_ptr = bitcast <vscale x 8 x bfloat>* %base to bfloat *
+ %res = call { <vscale x 8 x bfloat>, <vscale x 8 x bfloat>, <vscale x 8 x bfloat>, <vscale x 8 x bfloat> } @llvm.aarch64.sve.ld4q.sret.nxv8bf16(<vscale x 8 x i1> %pg, ptr %base_ptr);
+ ret { <vscale x 8 x bfloat>, <vscale x 8 x bfloat>, <vscale x 8 x bfloat>, <vscale x 8 x bfloat> } %res
+}
+
+define { <vscale x 8 x bfloat>, <vscale x 8 x bfloat>, <vscale x 8 x bfloat>, <vscale x 8 x bfloat> } @ld4q_ss_bf16(<vscale x 8 x i1> %pg, ptr %addr, i64 %a) {
+; CHECK-LABEL: ld4q_ss_bf16:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ld4q { z0.q - z3.q }, p0/z, [x0, x1, lsl #4]
+; CHECK-NEXT: ret
+ %addr2 = getelementptr i128, ptr %addr, i64 %a
+ %res = call { <vscale x 8 x bfloat>, <vscale x 8 x bfloat>, <vscale x 8 x bfloat>, <vscale x 8 x bfloat> } @llvm.aarch64.sve.ld4q.sret.nxv8bf16(<vscale x 8 x i1> %pg, ptr %addr2);
+ ret { <vscale x 8 x bfloat>, <vscale x 8 x bfloat>, <vscale x 8 x bfloat>, <vscale x 8 x bfloat> } %res
+}
+
+define { <vscale x 8 x bfloat>, <vscale x 8 x bfloat>, <vscale x 8 x bfloat>, <vscale x 8 x bfloat> } @ld4q_bf16(<vscale x 8 x i1> %pg, ptr %addr) {
+; CHECK-LABEL: ld4q_bf16:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ld4q { z0.q - z3.q }, p0/z, [x0]
+; CHECK-NEXT: ret
+ %res = call { <vscale x 8 x bfloat>, <vscale x 8 x bfloat>, <vscale x 8 x bfloat>, <vscale x 8 x bfloat> } @llvm.aarch64.sve.ld4q.sret.nxv8bf16(<vscale x 8 x i1> %pg, ptr %addr);
+ ret { <vscale x 8 x bfloat>, <vscale x 8 x bfloat>, <vscale x 8 x bfloat>, <vscale x 8 x bfloat> } %res
+}
+
+
+declare { <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64> } @llvm.aarch64.sve.ldnt1.pn.x4.nxv2i64(target("aarch64.svcount"), ptr)
+declare { <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32> } @llvm.aarch64.sve.ldnt1.pn.x4.nxv4i32(target("aarch64.svcount"), ptr)
+declare { <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16> } @llvm.aarch64.sve.ldnt1.pn.x4.nxv8i16(target("aarch64.svcount"), ptr)
+declare { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } @llvm.aarch64.sve.ldnt1.pn.x4.nxv16i8(target("aarch64.svcount"), ptr)
+declare { <vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x double> } @llvm.aarch64.sve.ldnt1.pn.x4.nxv2f64(target("aarch64.svcount"), ptr)
+declare { <vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float> } @llvm.aarch64.sve.ldnt1.pn.x4.nxv4f32(target("aarch64.svcount"), ptr)
+declare { <vscale x 8 x half>, <vscale x 8 x half>, <vscale x 8 x half>, <vscale x 8 x half> } @llvm.aarch64.sve.ldnt1.pn.x4.nxv8f16(target("aarch64.svcount"), ptr)
+declare { <vscale x 8 x bfloat>, <vscale x 8 x bfloat>, <vscale x 8 x bfloat>, <vscale x 8 x bfloat> } @llvm.aarch64.sve.ldnt1.pn.x4.nxv8bf16(target("aarch64.svcount"), ptr)
+
+declare { <vscale x 16 x i8>, <vscale x 16 x i8> } @llvm.aarch64.sve.ld2q.sret.nxv16i8(<vscale x 16 x i1>, ptr)
+declare { <vscale x 8 x i16>, <vscale x 8 x i16> } @llvm.aarch64.sve.ld2q.sret.nxv8i16(<vscale x 8 x i1>, ptr)
+declare { <vscale x 4 x i32>, <vscale x 4 x i32> } @llvm.aarch64.sve.ld2q.sret.nxv4i32(<vscale x 4 x i1>, ptr)
+declare { <vscale x 2 x i64>, <vscale x 2 x i64> } @llvm.aarch64.sve.ld2q.sret.nxv2i64(<vscale x 2 x i1>, ptr)
+
+declare { <vscale x 8 x half>, <vscale x 8 x half> } @llvm.aarch64.sve.ld2q.sret.nxv8f16(<vscale x 8 x i1>, ptr)
+declare { <vscale x 4 x float>, <vscale x 4 x float> } @llvm.aarch64.sve.ld2q.sret.nxv4f32(<vscale x 4 x i1>, ptr)
+declare { <vscale x 2 x double>, <vscale x 2 x double> } @llvm.aarch64.sve.ld2q.sret.nxv2f64(<vscale x 2 x i1>, ptr)
+declare { <vscale x 8 x bfloat>, <vscale x 8 x bfloat> } @llvm.aarch64.sve.ld2q.sret.nxv8bf16(<vscale x 8 x i1>, ptr)
+
+declare { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } @llvm.aarch64.sve.ld3q.sret.nxv16i8(<vscale x 16 x i1>, ptr)
+declare { <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16> } @llvm.aarch64.sve.ld3q.sret.nxv8i16(<vscale x 8 x i1>, ptr)
+declare { <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32> } @llvm.aarch64.sve.ld3q.sret.nxv4i32(<vscale x 4 x i1>, ptr)
+declare { <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64> } @llvm.aarch64.sve.ld3q.sret.nxv2i64(<vscale x 2 x i1>, ptr)
+
+declare { <vscale x 8 x half>, <vscale x 8 x half>, <vscale x 8 x half> } @llvm.aarch64.sve.ld3q.sret.nxv8f16(<vscale x 8 x i1>, ptr)
+declare { <vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float> } @llvm.aarch64.sve.ld3q.sret.nxv4f32(<vscale x 4 x i1>, ptr)
+declare { <vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x double> } @llvm.aarch64.sve.ld3q.sret.nxv2f64(<vscale x 2 x i1>, ptr)
+declare { <vscale x 8 x bfloat>, <vscale x 8 x bfloat>, <vscale x 8 x bfloat> } @llvm.aarch64.sve.ld3q.sret.nxv8bf16(<vscale x 8 x i1>, ptr)
+
+declare { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } @llvm.aarch64.sve.ld4q.sret.nxv16i8(<vscale x 16 x i1>, ptr)
+declare { <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16> } @llvm.aarch64.sve.ld4q.sret.nxv8i16(<vscale x 8 x i1>, ptr)
+declare { <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32> } @llvm.aarch64.sve.ld4q.sret.nxv4i32(<vscale x 4 x i1>, ptr)
+declare { <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64> } @llvm.aarch64.sve.ld4q.sret.nxv2i64(<vscale x 2 x i1>, ptr)
+
+declare { <vscale x 8 x half>, <vscale x 8 x half>, <vscale x 8 x half>, <vscale x 8 x half> } @llvm.aarch64.sve.ld4q.sret.nxv8f16(<vscale x 8 x i1>, ptr)
+declare { <vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float> } @llvm.aarch64.sve.ld4q.sret.nxv4f32(<vscale x 4 x i1>, ptr)
+declare { <vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x double> } @llvm.aarch64.sve.ld4q.sret.nxv2f64(<vscale x 2 x i1>, ptr)
+declare { <vscale x 8 x bfloat>, <vscale x 8 x bfloat>, <vscale x 8 x bfloat>, <vscale x 8 x bfloat> } @llvm.aarch64.sve.ld4q.sret.nxv8bf16(<vscale x 8 x i1>, ptr)
diff --git a/llvm/test/CodeGen/AArch64/sve2p1-intrinsics-multivec-stores.ll b/llvm/test/CodeGen/AArch64/sve2p1-intrinsics-multivec-stores.ll
new file mode 100644
index 000000000000000..48ec0161cb8b8ce
--- /dev/null
+++ b/llvm/test/CodeGen/AArch64/sve2p1-intrinsics-multivec-stores.ll
@@ -0,0 +1,910 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc -mtriple=aarch64-linux-gnu -mattr=+sve2p1,+bf16 < %s | FileCheck %s
+
+;
+; ST2Q
+;
+define void @st2q_ss_i8(<vscale x 16 x i8> %v0, <vscale x 16 x i8> %v1, <vscale x 16 x i1> %pred, ptr %addr, i64 %offset) {
+; CHECK-LABEL: st2q_ss_i8:
+; CHECK: // %bb.0:
+; CHECK-NEXT: // kill: def $z1 killed $z1 killed $z0_z1 def $z0_z1
+; CHECK-NEXT: // kill: def $z0 killed $z0 killed $z0_z1 def $z0_z1
+; CHECK-NEXT: st2q { z0.q, z1.q }, p0, [x0, x1, lsl #4]
+; CHECK-NEXT: ret
+ %1 = getelementptr i128, ptr %addr, i64 %offset
+ call void @llvm.aarch64.sve.st2q.nxv16i8(<vscale x 16 x i8>%v0, <vscale x 16 x i8> %v1 ,
+ <vscale x 16 x i1> %pred,
+ ptr %1)
+ ret void
+}
+
+define void @st2q_ss_i16(<vscale x 8 x i16> %v0, <vscale x 8 x i16> %v1, <vscale x 8 x i1> %pred, ptr %addr, i64 %offset) {
+; CHECK-LABEL: st2q_ss_i16:
+; CHECK: // %bb.0:
+; CHECK-NEXT: // kill: def $z1 killed $z1 killed $z0_z1 def $z0_z1
+; CHECK-NEXT: // kill: def $z0 killed $z0 killed $z0_z1 def $z0_z1
+; CHECK-NEXT: st2q { z0.q, z1.q }, p0, [x0, x1, lsl #4]
+; CHECK-NEXT: ret
+ %1 = getelementptr i128, ptr %addr, i64 %offset
+ call void @llvm.aarch64.sve.st2q.nxv8i16(<vscale x 8 x i16> %v0,
+ <vscale x 8 x i16> %v1,
+ <vscale x 8 x i1> %pred,
+ ptr %1)
+ ret void
+}
+
+define void @st2q_ss_i32(<vscale x 4 x i32> %v0, <vscale x 4 x i32> %v1, <vscale x 4 x i1> %pred, ptr %addr, i64 %offset) {
+; CHECK-LABEL: st2q_ss_i32:
+; CHECK: // %bb.0:
+; CHECK-NEXT: // kill: def $z1 killed $z1 killed $z0_z1 def $z0_z1
+; CHECK-NEXT: // kill: def $z0 killed $z0 killed $z0_z1 def $z0_z1
+; CHECK-NEXT: st2q { z0.q, z1.q }, p0, [x0, x1, lsl #4]
+; CHECK-NEXT: ret
+ %1 = getelementptr i128, ptr %addr, i64 %offset
+ call void @llvm.aarch64.sve.st2q.nxv4i32(<vscale x 4 x i32> %v0,
+ <vscale x 4 x i32> %v1,
+ <vscale x 4 x i1> %pred,
+ ptr %1)
+ ret void
+}
+
+define void @st2q_ss_i64(<vscale x 2 x i64> %v0, <vscale x 2 x i64> %v1, <vscale x 2 x i1> %pred, ptr %addr, i64 %offset) {
+; CHECK-LABEL: st2q_ss_i64:
+; CHECK: // %bb.0:
+; CHECK-NEXT: // kill: def $z1 killed $z1 killed $z0_z1 def $z0_z1
+; CHECK-NEXT: // kill: def $z0 killed $z0 killed $z0_z1 def $z0_z1
+; CHECK-NEXT: st2q { z0.q, z1.q }, p0, [x0, x1, lsl #4]
+; CHECK-NEXT: ret
+ %1 = getelementptr i128, ptr %addr, i64 %offset
+ call void @llvm.aarch64.sve.st2q.nxv2i64(<vscale x 2 x i64> %v0,
+ <vscale x 2 x i64> %v1,
+ <vscale x 2 x i1> %pred,
+ ptr %1)
+ ret void
+}
+
+define void @st2q_ss_f16(<vscale x 8 x half> %v0, <vscale x 8 x half> %v1, <vscale x 8 x i1> %pred, ptr %addr, i64 %offset) {
+; CHECK-LABEL: st2q_ss_f16:
+; CHECK: // %bb.0:
+; CHECK-NEXT: // kill: def $z1 killed $z1 killed $z0_z1 def $z0_z1
+; CHECK-NEXT: // kill: def $z0 killed $z0 killed $z0_z1 def $z0_z1
+; CHECK-NEXT: st2q { z0.q, z1.q }, p0, [x0, x1, lsl #4]
+; CHECK-NEXT: ret
+ %1 = getelementptr i128, ptr %addr, i64 %offset
+ call void @llvm.aarch64.sve.st2q.nxv8f16(<vscale x 8 x half> %v0,
+ <vscale x 8 x half> %v1,
+ <vscale x 8 x i1> %pred,
+ ptr %1)
+ ret void
+}
+
+define void @st2q_ss_f32(<vscale x 4 x float> %v0, <vscale x 4 x float> %v1, <vscale x 4 x i1> %pred, ptr %addr, i64 %offset) {
+; CHECK-LABEL: st2q_ss_f32:
+; CHECK: // %bb.0:
+; CHECK-NEXT: // kill: def $z1 killed $z1 killed $z0_z1 def $z0_z1
+; CHECK-NEXT: // kill: def $z0 killed $z0 killed $z0_z1 def $z0_z1
+; CHECK-NEXT: st2q { z0.q, z1.q }, p0, [x0, x1, lsl #4]
+; CHECK-NEXT: ret
+ %1 = getelementptr i128, ptr %addr, i64 %offset
+ call void @llvm.aarch64.sve.st2q.nxv4f32(<vscale x 4 x float> %v0,
+ <vscale x 4 x float> %v1,
+ <vscale x 4 x i1> %pred,
+ ptr %1)
+ ret void
+}
+
+define void @st2q_ss_f64(<vscale x 2 x double> %v0, <vscale x 2 x double> %v1, <vscale x 2 x i1> %pred, ptr %addr, i64 %offset) {
+; CHECK-LABEL: st2q_ss_f64:
+; CHECK: // %bb.0:
+; CHECK-NEXT: // kill: def $z1 killed $z1 killed $z0_z1 def $z0_z1
+; CHECK-NEXT: // kill: def $z0 killed $z0 killed $z0_z1 def $z0_z1
+; CHECK-NEXT: st2q { z0.q, z1.q }, p0, [x0, x1, lsl #4]
+; CHECK-NEXT: ret
+ %1 = getelementptr i128, ptr %addr, i64 %offset
+ call void @llvm.aarch64.sve.st2q.nxv2f64(<vscale x 2 x double> %v0,
+ <vscale x 2 x double> %v1,
+ <vscale x 2 x i1> %pred,
+ ptr %1)
+ ret void
+}
+
+define void @st2q_ss_bf16(<vscale x 8 x bfloat> %v0, <vscale x 8 x bfloat> %v1, <vscale x 8 x i1> %pred, ptr %addr, i64 %offset) {
+; CHECK-LABEL: st2q_ss_bf16:
+; CHECK: // %bb.0:
+; CHECK-NEXT: // kill: def $z1 killed $z1 killed $z0_z1 def $z0_z1
+; CHECK-NEXT: // kill: def $z0 killed $z0 killed $z0_z1 def $z0_z1
+; CHECK-NEXT: st2q { z0.q, z1.q }, p0, [x0, x1, lsl #4]
+; CHECK-NEXT: ret
+ %1 = getelementptr i128, ptr %addr, i64 %offset
+ call void @llvm.aarch64.sve.st2q.nxv8bf16(<vscale x 8 x bfloat> %v0,
+ <vscale x 8 x bfloat> %v1,
+ <vscale x 8 x i1> %pred,
+ ptr %1)
+ ret void
+}
+
+
+define void @st2q_si_i8_off16(<vscale x 16 x i8> %v0, <vscale x 16 x i8> %v1, <vscale x 16 x i1> %pred, <vscale x 16 x i8>* %addr) {
+; CHECK-LABEL: st2q_si_i8_off16:
+; CHECK: // %bb.0:
+; CHECK-NEXT: // kill: def $z1 killed $z1 killed $z0_z1 def $z0_z1
+; CHECK-NEXT: // kill: def $z0 killed $z0 killed $z0_z1 def $z0_z1
+; CHECK-NEXT: st2q { z0.q, z1.q }, p0, [x0, #-16, mul vl]
+; CHECK-NEXT: ret
+ %base = getelementptr <vscale x 16 x i8>, <vscale x 16 x i8>* %addr, i64 -16
+ call void @llvm.aarch64.sve.st2q.nxv16i8(<vscale x 16 x i8> %v0,
+ <vscale x 16 x i8> %v1,
+ <vscale x 16 x i1> %pred,
+ i8* %base)
+ ret void
+}
+
+define void @st2q_si_i8_off14(<vscale x 16 x i8> %v0, <vscale x 16 x i8> %v1, <vscale x 16 x i1> %pred, <vscale x 16 x i8>* %addr) {
+; CHECK-LABEL: st2q_si_i8_off14:
+; CHECK: // %bb.0:
+; CHECK-NEXT: // kill: def $z1 killed $z1 killed $z0_z1 def $z0_z1
+; CHECK-NEXT: // kill: def $z0 killed $z0 killed $z0_z1 def $z0_z1
+; CHECK-NEXT: st2q { z0.q, z1.q }, p0, [x0, #14, mul vl]
+; CHECK-NEXT: ret
+ %base = getelementptr <vscale x 16 x i8>, <vscale x 16 x i8>* %addr, i64 14
+ call void @llvm.aarch64.sve.st2q.nxv16i8(<vscale x 16 x i8> %v0,
+ <vscale x 16 x i8> %v1,
+ <vscale x 16 x i1> %pred,
+ i8* %base)
+ ret void
+}
+
+define void @st2q_si_i16(<vscale x 8 x i16> %v0, <vscale x 8 x i16> %v1, <vscale x 8 x i1> %pred, ptr %base) {
+; CHECK-LABEL: st2q_si_i16:
+; CHECK: // %bb.0:
+; CHECK-NEXT: // kill: def $z1 killed $z1 killed $z0_z1 def $z0_z1
+; CHECK-NEXT: // kill: def $z0 killed $z0 killed $z0_z1 def $z0_z1
+; CHECK-NEXT: st2q { z0.q, z1.q }, p0, [x0, #14, mul vl]
+; CHECK-NEXT: ret
+ %gep = getelementptr <vscale x 8 x i16>, ptr %base, i64 14
+ call void @llvm.aarch64.sve.st2q.nxv8i16(<vscale x 8 x i16> %v0,
+ <vscale x 8 x i16> %v1,
+ <vscale x 8 x i1> %pred,
+ i8* %gep)
+ ret void
+}
+
+define void @st2q_si_i32(<vscale x 4 x i32> %v0, <vscale x 4 x i32> %v1, <vscale x 4 x i1> %pred, ptr %base) {
+; CHECK-LABEL: st2q_si_i32:
+; CHECK: // %bb.0:
+; CHECK-NEXT: // kill: def $z1 killed $z1 killed $z0_z1 def $z0_z1
+; CHECK-NEXT: // kill: def $z0 killed $z0 killed $z0_z1 def $z0_z1
+; CHECK-NEXT: st2q { z0.q, z1.q }, p0, [x0, #14, mul vl]
+; CHECK-NEXT: ret
+ %gep = getelementptr <vscale x 4 x i32>, ptr %base, i64 14
+ call void @llvm.aarch64.sve.st2q.nxv4i32(<vscale x 4 x i32> %v0,
+ <vscale x 4 x i32> %v1,
+ <vscale x 4 x i1> %pred,
+ i32* %gep)
+ ret void
+}
+
+define void @st2q_si_i64(<vscale x 2 x i64> %v0, <vscale x 2 x i64> %v1, <vscale x 2 x i1> %pred, ptr %base) {
+; CHECK-LABEL: st2q_si_i64:
+; CHECK: // %bb.0:
+; CHECK-NEXT: // kill: def $z1 killed $z1 killed $z0_z1 def $z0_z1
+; CHECK-NEXT: // kill: def $z0 killed $z0 killed $z0_z1 def $z0_z1
+; CHECK-NEXT: st2q { z0.q, z1.q }, p0, [x0, #14, mul vl]
+; CHECK-NEXT: ret
+ %gep = getelementptr <vscale x 2 x i64>, ptr %base, i64 14
+ call void @llvm.aarch64.sve.st2q.nxv2i64(<vscale x 2 x i64> %v0,
+ <vscale x 2 x i64> %v1,
+ <vscale x 2 x i1> %pred,
+ i64* %gep)
+ ret void
+}
+
+define void @st2q_si_f16(<vscale x 8 x half> %v0, <vscale x 8 x half> %v1, <vscale x 8 x i1> %pred, ptr %base) {
+; CHECK-LABEL: st2q_si_f16:
+; CHECK: // %bb.0:
+; CHECK-NEXT: // kill: def $z1 killed $z1 killed $z0_z1 def $z0_z1
+; CHECK-NEXT: // kill: def $z0 killed $z0 killed $z0_z1 def $z0_z1
+; CHECK-NEXT: st2q { z0.q, z1.q }, p0, [x0, #14, mul vl]
+; CHECK-NEXT: ret
+ %gep = getelementptr <vscale x 8 x half>, ptr %base, i64 14
+ call void @llvm.aarch64.sve.st2q.nxv8f16(<vscale x 8 x half> %v0,
+ <vscale x 8 x half> %v1,
+ <vscale x 8 x i1> %pred,
+ half* %gep)
+ ret void
+}
+
+define void @st2q_si_f32(<vscale x 4 x float> %v0, <vscale x 4 x float> %v1, <vscale x 4 x i1> %pred, ptr %base) {
+; CHECK-LABEL: st2q_si_f32:
+; CHECK: // %bb.0:
+; CHECK-NEXT: // kill: def $z1 killed $z1 killed $z0_z1 def $z0_z1
+; CHECK-NEXT: // kill: def $z0 killed $z0 killed $z0_z1 def $z0_z1
+; CHECK-NEXT: st2q { z0.q, z1.q }, p0, [x0, #14, mul vl]
+; CHECK-NEXT: ret
+ %gep = getelementptr <vscale x 4 x float>, ptr %base, i64 14
+ call void @llvm.aarch64.sve.st2q.nxv4f32(<vscale x 4 x float> %v0,
+ <vscale x 4 x float> %v1,
+ <vscale x 4 x i1> %pred,
+ float* %gep)
+ ret void
+}
+
+define void @st2q_si_f64(<vscale x 2 x double> %v0, <vscale x 2 x double> %v1, <vscale x 2 x i1> %pred, ptr %base) {
+; CHECK-LABEL: st2q_si_f64:
+; CHECK: // %bb.0:
+; CHECK-NEXT: // kill: def $z1 killed $z1 killed $z0_z1 def $z0_z1
+; CHECK-NEXT: // kill: def $z0 killed $z0 killed $z0_z1 def $z0_z1
+; CHECK-NEXT: st2q { z0.q, z1.q }, p0, [x0, #14, mul vl]
+; CHECK-NEXT: ret
+ %gep= getelementptr <vscale x 2 x double>, ptr %base, i64 14
+ call void @llvm.aarch64.sve.st2q.nxv2f64(<vscale x 2 x double> %v0,
+ <vscale x 2 x double> %v1,
+ <vscale x 2 x i1> %pred,
+ double* %gep)
+ ret void
+}
+
+define void @st2q_si_bf16(<vscale x 8 x bfloat> %v0, <vscale x 8 x bfloat> %v1, <vscale x 8 x i1> %pred, ptr %base) {
+; CHECK-LABEL: st2q_si_bf16:
+; CHECK: // %bb.0:
+; CHECK-NEXT: // kill: def $z1 killed $z1 killed $z0_z1 def $z0_z1
+; CHECK-NEXT: // kill: def $z0 killed $z0 killed $z0_z1 def $z0_z1
+; CHECK-NEXT: st2q { z0.q, z1.q }, p0, [x0, #14, mul vl]
+; CHECK-NEXT: ret
+ %gep = getelementptr <vscale x 8 x bfloat>, ptr %base, i64 14
+ call void @llvm.aarch64.sve.st2q.nxv8bf16(<vscale x 8 x bfloat> %v0,
+ <vscale x 8 x bfloat> %v1,
+ <vscale x 8 x i1> %pred,
+ bfloat* %gep)
+ ret void
+}
+
+
+;
+; ST3Q
+;
+define void @st3q_ss_i8(<vscale x 16 x i8> %v0, <vscale x 16 x i8> %v1, <vscale x 16 x i8> %v2, <vscale x 16 x i1> %pred, ptr %addr, i64 %offset) {
+; CHECK-LABEL: st3q_ss_i8:
+; CHECK: // %bb.0:
+; CHECK-NEXT: // kill: def $z2 killed $z2 killed $z0_z1_z2 def $z0_z1_z2
+; CHECK-NEXT: // kill: def $z1 killed $z1 killed $z0_z1_z2 def $z0_z1_z2
+; CHECK-NEXT: // kill: def $z0 killed $z0 killed $z0_z1_z2 def $z0_z1_z2
+; CHECK-NEXT: st3q { z0.q - z2.q }, p0, [x0, x1, lsl #4]
+; CHECK-NEXT: ret
+ %1 = getelementptr i128, ptr %addr, i64 %offset
+ call void @llvm.aarch64.sve.st3q.nxv16i8(<vscale x 16 x i8>%v0,
+ <vscale x 16 x i8> %v1,
+ <vscale x 16 x i8> %v2,
+ <vscale x 16 x i1> %pred,
+ ptr %1)
+ ret void
+}
+
+define void @st3q_ss_i16(<vscale x 8 x i16> %v0, <vscale x 8 x i16> %v1, <vscale x 8 x i16> %v2, <vscale x 8 x i1> %pred, ptr %addr, i64 %offset) {
+; CHECK-LABEL: st3q_ss_i16:
+; CHECK: // %bb.0:
+; CHECK-NEXT: // kill: def $z2 killed $z2 killed $z0_z1_z2 def $z0_z1_z2
+; CHECK-NEXT: // kill: def $z1 killed $z1 killed $z0_z1_z2 def $z0_z1_z2
+; CHECK-NEXT: // kill: def $z0 killed $z0 killed $z0_z1_z2 def $z0_z1_z2
+; CHECK-NEXT: st3q { z0.q - z2.q }, p0, [x0, x1, lsl #4]
+; CHECK-NEXT: ret
+ %1 = getelementptr i128, ptr %addr, i64 %offset
+ call void @llvm.aarch64.sve.st3q.nxv8i16(<vscale x 8 x i16> %v0,
+ <vscale x 8 x i16> %v1,
+ <vscale x 8 x i16> %v2,
+ <vscale x 8 x i1> %pred,
+ ptr %1)
+ ret void
+}
+
+define void @st3q_ss_i32(<vscale x 4 x i32> %v0, <vscale x 4 x i32> %v1, <vscale x 4 x i32> %v2, <vscale x 4 x i1> %pred, ptr %addr, i64 %offset) {
+; CHECK-LABEL: st3q_ss_i32:
+; CHECK: // %bb.0:
+; CHECK-NEXT: // kill: def $z2 killed $z2 killed $z0_z1_z2 def $z0_z1_z2
+; CHECK-NEXT: // kill: def $z1 killed $z1 killed $z0_z1_z2 def $z0_z1_z2
+; CHECK-NEXT: // kill: def $z0 killed $z0 killed $z0_z1_z2 def $z0_z1_z2
+; CHECK-NEXT: st3q { z0.q - z2.q }, p0, [x0, x1, lsl #4]
+; CHECK-NEXT: ret
+ %1 = getelementptr i128, ptr %addr, i64 %offset
+ call void @llvm.aarch64.sve.st3q.nxv4i32(<vscale x 4 x i32> %v0,
+ <vscale x 4 x i32> %v1,
+ <vscale x 4 x i32> %v2,
+ <vscale x 4 x i1> %pred,
+ ptr %1)
+ ret void
+}
+
+define void @st3q_ss_i64(<vscale x 2 x i64> %v0, <vscale x 2 x i64> %v1, <vscale x 2 x i64> %v2, <vscale x 2 x i1> %pred, ptr %addr, i64 %offset) {
+; CHECK-LABEL: st3q_ss_i64:
+; CHECK: // %bb.0:
+; CHECK-NEXT: // kill: def $z2 killed $z2 killed $z0_z1_z2 def $z0_z1_z2
+; CHECK-NEXT: // kill: def $z1 killed $z1 killed $z0_z1_z2 def $z0_z1_z2
+; CHECK-NEXT: // kill: def $z0 killed $z0 killed $z0_z1_z2 def $z0_z1_z2
+; CHECK-NEXT: st3q { z0.q - z2.q }, p0, [x0, x1, lsl #4]
+; CHECK-NEXT: ret
+ %1 = getelementptr i128, ptr %addr, i64 %offset
+ call void @llvm.aarch64.sve.st3q.nxv2i64(<vscale x 2 x i64> %v0,
+ <vscale x 2 x i64> %v1,
+ <vscale x 2 x i64> %v2,
+ <vscale x 2 x i1> %pred,
+ ptr %1)
+ ret void
+}
+
+define void @st3q_ss_f16(<vscale x 8 x half> %v0, <vscale x 8 x half> %v1, <vscale x 8 x half> %v2, <vscale x 8 x i1> %pred, ptr %addr, i64 %offset) {
+; CHECK-LABEL: st3q_ss_f16:
+; CHECK: // %bb.0:
+; CHECK-NEXT: // kill: def $z2 killed $z2 killed $z0_z1_z2 def $z0_z1_z2
+; CHECK-NEXT: // kill: def $z1 killed $z1 killed $z0_z1_z2 def $z0_z1_z2
+; CHECK-NEXT: // kill: def $z0 killed $z0 killed $z0_z1_z2 def $z0_z1_z2
+; CHECK-NEXT: st3q { z0.q - z2.q }, p0, [x0, x1, lsl #4]
+; CHECK-NEXT: ret
+ %1 = getelementptr i128, ptr %addr, i64 %offset
+ call void @llvm.aarch64.sve.st3q.nxv8f16(<vscale x 8 x half> %v0,
+ <vscale x 8 x half> %v1,
+ <vscale x 8 x half> %v2,
+ <vscale x 8 x i1> %pred,
+ ptr %1)
+ ret void
+}
+
+define void @st3q_ss_f32(<vscale x 4 x float> %v0, <vscale x 4 x float> %v1, <vscale x 4 x float> %v2, <vscale x 4 x i1> %pred, ptr %addr, i64 %offset) {
+; CHECK-LABEL: st3q_ss_f32:
+; CHECK: // %bb.0:
+; CHECK-NEXT: // kill: def $z2 killed $z2 killed $z0_z1_z2 def $z0_z1_z2
+; CHECK-NEXT: // kill: def $z1 killed $z1 killed $z0_z1_z2 def $z0_z1_z2
+; CHECK-NEXT: // kill: def $z0 killed $z0 killed $z0_z1_z2 def $z0_z1_z2
+; CHECK-NEXT: st3q { z0.q - z2.q }, p0, [x0, x1, lsl #4]
+; CHECK-NEXT: ret
+ %1 = getelementptr i128, ptr %addr, i64 %offset
+ call void @llvm.aarch64.sve.st3q.nxv4f32(<vscale x 4 x float> %v0,
+ <vscale x 4 x float> %v1,
+ <vscale x 4 x float> %v2,
+ <vscale x 4 x i1> %pred,
+ ptr %1)
+ ret void
+}
+
+define void @st3q_ss_f64(<vscale x 2 x double> %v0, <vscale x 2 x double> %v1, <vscale x 2 x double> %v2, <vscale x 2 x i1> %pred, ptr %addr, i64 %offset) {
+; CHECK-LABEL: st3q_ss_f64:
+; CHECK: // %bb.0:
+; CHECK-NEXT: // kill: def $z2 killed $z2 killed $z0_z1_z2 def $z0_z1_z2
+; CHECK-NEXT: // kill: def $z1 killed $z1 killed $z0_z1_z2 def $z0_z1_z2
+; CHECK-NEXT: // kill: def $z0 killed $z0 killed $z0_z1_z2 def $z0_z1_z2
+; CHECK-NEXT: st3q { z0.q - z2.q }, p0, [x0, x1, lsl #4]
+; CHECK-NEXT: ret
+ %1 = getelementptr i128, ptr %addr, i64 %offset
+ call void @llvm.aarch64.sve.st3q.nxv2f64(<vscale x 2 x double> %v0,
+ <vscale x 2 x double> %v1,
+ <vscale x 2 x double> %v2,
+ <vscale x 2 x i1> %pred,
+ ptr %1)
+ ret void
+}
+
+define void @st3q_ss_bf16(<vscale x 8 x bfloat> %v0, <vscale x 8 x bfloat> %v1, <vscale x 8 x bfloat> %v2, <vscale x 8 x i1> %pred, ptr %addr, i64 %offset) {
+; CHECK-LABEL: st3q_ss_bf16:
+; CHECK: // %bb.0:
+; CHECK-NEXT: // kill: def $z2 killed $z2 killed $z0_z1_z2 def $z0_z1_z2
+; CHECK-NEXT: // kill: def $z1 killed $z1 killed $z0_z1_z2 def $z0_z1_z2
+; CHECK-NEXT: // kill: def $z0 killed $z0 killed $z0_z1_z2 def $z0_z1_z2
+; CHECK-NEXT: st3q { z0.q - z2.q }, p0, [x0, x1, lsl #4]
+; CHECK-NEXT: ret
+ %1 = getelementptr i128, ptr %addr, i64 %offset
+ call void @llvm.aarch64.sve.st3q.nxv8bf16(<vscale x 8 x bfloat> %v0,
+ <vscale x 8 x bfloat> %v1,
+ <vscale x 8 x bfloat> %v2,
+ <vscale x 8 x i1> %pred,
+ ptr %1)
+ ret void
+}
+
+define void @st3q_si_i8_off24(<vscale x 16 x i8> %v0, <vscale x 16 x i8> %v1, <vscale x 16 x i8> %v2, <vscale x 16 x i1> %pred, <vscale x 16 x i8>* %addr) {
+; CHECK-LABEL: st3q_si_i8_off24:
+; CHECK: // %bb.0:
+; CHECK-NEXT: // kill: def $z2 killed $z2 killed $z0_z1_z2 def $z0_z1_z2
+; CHECK-NEXT: // kill: def $z1 killed $z1 killed $z0_z1_z2 def $z0_z1_z2
+; CHECK-NEXT: // kill: def $z0 killed $z0 killed $z0_z1_z2 def $z0_z1_z2
+; CHECK-NEXT: st3q { z0.q - z2.q }, p0, [x0, #-24, mul vl]
+; CHECK-NEXT: ret
+ %base = getelementptr <vscale x 16 x i8>, <vscale x 16 x i8>* %addr, i64 -24
+ call void @llvm.aarch64.sve.st3q.nxv16i8(<vscale x 16 x i8> %v0,
+ <vscale x 16 x i8> %v1,
+ <vscale x 16 x i8> %v2,
+ <vscale x 16 x i1> %pred,
+ i8* %base)
+ ret void
+}
+
+define void @st3q_si_i8_off21(<vscale x 16 x i8> %v0, <vscale x 16 x i8> %v1, <vscale x 16 x i8> %v2, <vscale x 16 x i1> %pred, <vscale x 16 x i8>* %addr) {
+; CHECK-LABEL: st3q_si_i8_off21:
+; CHECK: // %bb.0:
+; CHECK-NEXT: // kill: def $z2 killed $z2 killed $z0_z1_z2 def $z0_z1_z2
+; CHECK-NEXT: // kill: def $z1 killed $z1 killed $z0_z1_z2 def $z0_z1_z2
+; CHECK-NEXT: // kill: def $z0 killed $z0 killed $z0_z1_z2 def $z0_z1_z2
+; CHECK-NEXT: st3q { z0.q - z2.q }, p0, [x0, #21, mul vl]
+; CHECK-NEXT: ret
+ %base = getelementptr <vscale x 16 x i8>, <vscale x 16 x i8>* %addr, i64 21
+ call void @llvm.aarch64.sve.st3q.nxv16i8(<vscale x 16 x i8> %v0,
+ <vscale x 16 x i8> %v1,
+ <vscale x 16 x i8> %v2,
+ <vscale x 16 x i1> %pred,
+ i8* %base)
+ ret void
+}
+
+define void @st3q_si_i16(<vscale x 8 x i16> %v0, <vscale x 8 x i16> %v1, <vscale x 8 x i16> %v2, <vscale x 8 x i1> %pred, <vscale x 8 x i16>* %addr) {
+; CHECK-LABEL: st3q_si_i16:
+; CHECK: // %bb.0:
+; CHECK-NEXT: // kill: def $z2 killed $z2 killed $z0_z1_z2 def $z0_z1_z2
+; CHECK-NEXT: // kill: def $z1 killed $z1 killed $z0_z1_z2 def $z0_z1_z2
+; CHECK-NEXT: // kill: def $z0 killed $z0 killed $z0_z1_z2 def $z0_z1_z2
+; CHECK-NEXT: st3q { z0.q - z2.q }, p0, [x0, #21, mul vl]
+; CHECK-NEXT: ret
+ %base = getelementptr <vscale x 8 x i16>, <vscale x 8 x i16>* %addr, i64 21
+ call void @llvm.aarch64.sve.st3q.nxv8i16(<vscale x 8 x i16> %v0,
+ <vscale x 8 x i16> %v1,
+ <vscale x 8 x i16> %v2,
+ <vscale x 8 x i1> %pred,
+ i8* %base)
+ ret void
+}
+
+define void @st3q_si_i32(<vscale x 4 x i32> %v0, <vscale x 4 x i32> %v1, <vscale x 4 x i32> %v2, <vscale x 4 x i1> %pred, <vscale x 4 x i32>* %addr) {
+; CHECK-LABEL: st3q_si_i32:
+; CHECK: // %bb.0:
+; CHECK-NEXT: // kill: def $z2 killed $z2 killed $z0_z1_z2 def $z0_z1_z2
+; CHECK-NEXT: // kill: def $z1 killed $z1 killed $z0_z1_z2 def $z0_z1_z2
+; CHECK-NEXT: // kill: def $z0 killed $z0 killed $z0_z1_z2 def $z0_z1_z2
+; CHECK-NEXT: st3q { z0.q - z2.q }, p0, [x0, #21, mul vl]
+; CHECK-NEXT: ret
+ %base = getelementptr <vscale x 4 x i32>, <vscale x 4 x i32>* %addr, i64 21
+ call void @llvm.aarch64.sve.st3q.nxv4i32(<vscale x 4 x i32> %v0,
+ <vscale x 4 x i32> %v1,
+ <vscale x 4 x i32> %v2,
+ <vscale x 4 x i1> %pred,
+ i32* %base)
+ ret void
+}
+
+define void @st3q_si_i64(<vscale x 2 x i64> %v0, <vscale x 2 x i64> %v1,<vscale x 2 x i64> %v2, <vscale x 2 x i1> %pred, <vscale x 2 x i64>* %addr) {
+; CHECK-LABEL: st3q_si_i64:
+; CHECK: // %bb.0:
+; CHECK-NEXT: // kill: def $z2 killed $z2 killed $z0_z1_z2 def $z0_z1_z2
+; CHECK-NEXT: // kill: def $z1 killed $z1 killed $z0_z1_z2 def $z0_z1_z2
+; CHECK-NEXT: // kill: def $z0 killed $z0 killed $z0_z1_z2 def $z0_z1_z2
+; CHECK-NEXT: st3q { z0.q - z2.q }, p0, [x0, #21, mul vl]
+; CHECK-NEXT: ret
+ %base = getelementptr <vscale x 2 x i64>, <vscale x 2 x i64>* %addr, i64 21
+ call void @llvm.aarch64.sve.st3q.nxv2i64(<vscale x 2 x i64> %v0,
+ <vscale x 2 x i64> %v1,
+ <vscale x 2 x i64> %v2,
+ <vscale x 2 x i1> %pred,
+ i64* %base)
+ ret void
+}
+
+define void @st3q_si_f16(<vscale x 8 x half> %v0, <vscale x 8 x half> %v1, <vscale x 8 x half> %v2, <vscale x 8 x i1> %pred, <vscale x 8 x half>* %addr) {
+; CHECK-LABEL: st3q_si_f16:
+; CHECK: // %bb.0:
+; CHECK-NEXT: // kill: def $z2 killed $z2 killed $z0_z1_z2 def $z0_z1_z2
+; CHECK-NEXT: // kill: def $z1 killed $z1 killed $z0_z1_z2 def $z0_z1_z2
+; CHECK-NEXT: // kill: def $z0 killed $z0 killed $z0_z1_z2 def $z0_z1_z2
+; CHECK-NEXT: st3q { z0.q - z2.q }, p0, [x0, #21, mul vl]
+; CHECK-NEXT: ret
+ %base = getelementptr <vscale x 8 x half>, <vscale x 8 x half>* %addr, i64 21
+ call void @llvm.aarch64.sve.st3q.nxv8f16(<vscale x 8 x half> %v0,
+ <vscale x 8 x half> %v1,
+ <vscale x 8 x half> %v2,
+ <vscale x 8 x i1> %pred,
+ half* %base)
+ ret void
+}
+
+define void @st3q_si_f32(<vscale x 4 x float> %v0, <vscale x 4 x float> %v1, <vscale x 4 x float> %v2, <vscale x 4 x i1> %pred, <vscale x 4 x float>* %addr) {
+; CHECK-LABEL: st3q_si_f32:
+; CHECK: // %bb.0:
+; CHECK-NEXT: // kill: def $z2 killed $z2 killed $z0_z1_z2 def $z0_z1_z2
+; CHECK-NEXT: // kill: def $z1 killed $z1 killed $z0_z1_z2 def $z0_z1_z2
+; CHECK-NEXT: // kill: def $z0 killed $z0 killed $z0_z1_z2 def $z0_z1_z2
+; CHECK-NEXT: st3q { z0.q - z2.q }, p0, [x0, #21, mul vl]
+; CHECK-NEXT: ret
+ %base = getelementptr <vscale x 4 x float>, <vscale x 4 x float>* %addr, i64 21
+ call void @llvm.aarch64.sve.st3q.nxv4f32(<vscale x 4 x float> %v0,
+ <vscale x 4 x float> %v1,
+ <vscale x 4 x float> %v2,
+ <vscale x 4 x i1> %pred,
+ float* %base)
+ ret void
+}
+
+define void @st3q_si_f64(<vscale x 2 x double> %v0, <vscale x 2 x double> %v1, <vscale x 2 x double> %v2, <vscale x 2 x i1> %pred, <vscale x 2 x double>* %addr) {
+; CHECK-LABEL: st3q_si_f64:
+; CHECK: // %bb.0:
+; CHECK-NEXT: // kill: def $z2 killed $z2 killed $z0_z1_z2 def $z0_z1_z2
+; CHECK-NEXT: // kill: def $z1 killed $z1 killed $z0_z1_z2 def $z0_z1_z2
+; CHECK-NEXT: // kill: def $z0 killed $z0 killed $z0_z1_z2 def $z0_z1_z2
+; CHECK-NEXT: st3q { z0.q - z2.q }, p0, [x0, #21, mul vl]
+; CHECK-NEXT: ret
+ %base = getelementptr <vscale x 2 x double>, <vscale x 2 x double>* %addr, i64 21
+ call void @llvm.aarch64.sve.st3q.nxv2f64(<vscale x 2 x double> %v0,
+ <vscale x 2 x double> %v1,
+ <vscale x 2 x double> %v2,
+ <vscale x 2 x i1> %pred,
+ double* %base)
+ ret void
+}
+
+define void @st3q_si_bf16(<vscale x 8 x bfloat> %v0, <vscale x 8 x bfloat> %v1, <vscale x 8 x bfloat> %v2, <vscale x 8 x i1> %pred, <vscale x 8 x bfloat>* %addr) {
+; CHECK-LABEL: st3q_si_bf16:
+; CHECK: // %bb.0:
+; CHECK-NEXT: // kill: def $z2 killed $z2 killed $z0_z1_z2 def $z0_z1_z2
+; CHECK-NEXT: // kill: def $z1 killed $z1 killed $z0_z1_z2 def $z0_z1_z2
+; CHECK-NEXT: // kill: def $z0 killed $z0 killed $z0_z1_z2 def $z0_z1_z2
+; CHECK-NEXT: st3q { z0.q - z2.q }, p0, [x0, #21, mul vl]
+; CHECK-NEXT: ret
+ %base = getelementptr <vscale x 8 x bfloat>, <vscale x 8 x bfloat>* %addr, i64 21
+ call void @llvm.aarch64.sve.st3q.nxv8bf16(<vscale x 8 x bfloat> %v0,
+ <vscale x 8 x bfloat> %v1,
+ <vscale x 8 x bfloat> %v2,
+ <vscale x 8 x i1> %pred,
+ bfloat* %base)
+ ret void
+}
+
+;
+; ST4Q
+;
+define void @st4q_ss_i8(<vscale x 16 x i8> %v0, <vscale x 16 x i8> %v1, <vscale x 16 x i8> %v2,<vscale x 16 x i8> %v3, <vscale x 16 x i1> %pred, ptr %addr, i64 %offset) {
+; CHECK-LABEL: st4q_ss_i8:
+; CHECK: // %bb.0:
+; CHECK-NEXT: // kill: def $z3 killed $z3 killed $z0_z1_z2_z3 def $z0_z1_z2_z3
+; CHECK-NEXT: // kill: def $z2 killed $z2 killed $z0_z1_z2_z3 def $z0_z1_z2_z3
+; CHECK-NEXT: // kill: def $z1 killed $z1 killed $z0_z1_z2_z3 def $z0_z1_z2_z3
+; CHECK-NEXT: // kill: def $z0 killed $z0 killed $z0_z1_z2_z3 def $z0_z1_z2_z3
+; CHECK-NEXT: st4q { z0.q - z3.q }, p0, [x0, x1, lsl #4]
+; CHECK-NEXT: ret
+ %1 = getelementptr i128, ptr %addr, i64 %offset
+ call void @llvm.aarch64.sve.st4q.nxv16i8(<vscale x 16 x i8>%v0,
+ <vscale x 16 x i8> %v1,
+ <vscale x 16 x i8> %v2,
+ <vscale x 16 x i8> %v3,
+ <vscale x 16 x i1> %pred,
+ ptr %1)
+ ret void
+}
+
+define void @st4q_ss_i16(<vscale x 8 x i16> %v0, <vscale x 8 x i16> %v1, <vscale x 8 x i16> %v2, <vscale x 8 x i16> %v3, <vscale x 8 x i1> %pred, ptr %addr, i64 %offset) {
+; CHECK-LABEL: st4q_ss_i16:
+; CHECK: // %bb.0:
+; CHECK-NEXT: // kill: def $z3 killed $z3 killed $z0_z1_z2_z3 def $z0_z1_z2_z3
+; CHECK-NEXT: // kill: def $z2 killed $z2 killed $z0_z1_z2_z3 def $z0_z1_z2_z3
+; CHECK-NEXT: // kill: def $z1 killed $z1 killed $z0_z1_z2_z3 def $z0_z1_z2_z3
+; CHECK-NEXT: // kill: def $z0 killed $z0 killed $z0_z1_z2_z3 def $z0_z1_z2_z3
+; CHECK-NEXT: st4q { z0.q - z3.q }, p0, [x0, x1, lsl #4]
+; CHECK-NEXT: ret
+ %1 = getelementptr i128, ptr %addr, i64 %offset
+ call void @llvm.aarch64.sve.st4q.nxv8i16(<vscale x 8 x i16> %v0,
+ <vscale x 8 x i16> %v1,
+ <vscale x 8 x i16> %v2,
+ <vscale x 8 x i16> %v3,
+ <vscale x 8 x i1> %pred,
+ ptr %1)
+ ret void
+}
+
+define void @st4q_ss_i32(<vscale x 4 x i32> %v0, <vscale x 4 x i32> %v1, <vscale x 4 x i32> %v2, <vscale x 4 x i32> %v3, <vscale x 4 x i1> %pred, ptr %addr, i64 %offset) {
+; CHECK-LABEL: st4q_ss_i32:
+; CHECK: // %bb.0:
+; CHECK-NEXT: // kill: def $z3 killed $z3 killed $z0_z1_z2_z3 def $z0_z1_z2_z3
+; CHECK-NEXT: // kill: def $z2 killed $z2 killed $z0_z1_z2_z3 def $z0_z1_z2_z3
+; CHECK-NEXT: // kill: def $z1 killed $z1 killed $z0_z1_z2_z3 def $z0_z1_z2_z3
+; CHECK-NEXT: // kill: def $z0 killed $z0 killed $z0_z1_z2_z3 def $z0_z1_z2_z3
+; CHECK-NEXT: st4q { z0.q - z3.q }, p0, [x0, x1, lsl #4]
+; CHECK-NEXT: ret
+ %1 = getelementptr i128, ptr %addr, i64 %offset
+ call void @llvm.aarch64.sve.st4q.nxv4i32(<vscale x 4 x i32> %v0,
+ <vscale x 4 x i32> %v1,
+ <vscale x 4 x i32> %v2,
+ <vscale x 4 x i32> %v3,
+ <vscale x 4 x i1> %pred,
+ ptr %1)
+ ret void
+}
+
+define void @st4q_ss_i64(<vscale x 2 x i64> %v0, <vscale x 2 x i64> %v1, <vscale x 2 x i64> %v2, <vscale x 2 x i64> %v3, <vscale x 2 x i1> %pred, ptr %addr, i64 %offset) {
+; CHECK-LABEL: st4q_ss_i64:
+; CHECK: // %bb.0:
+; CHECK-NEXT: // kill: def $z3 killed $z3 killed $z0_z1_z2_z3 def $z0_z1_z2_z3
+; CHECK-NEXT: // kill: def $z2 killed $z2 killed $z0_z1_z2_z3 def $z0_z1_z2_z3
+; CHECK-NEXT: // kill: def $z1 killed $z1 killed $z0_z1_z2_z3 def $z0_z1_z2_z3
+; CHECK-NEXT: // kill: def $z0 killed $z0 killed $z0_z1_z2_z3 def $z0_z1_z2_z3
+; CHECK-NEXT: st4q { z0.q - z3.q }, p0, [x0, x1, lsl #4]
+; CHECK-NEXT: ret
+ %1 = getelementptr i128, ptr %addr, i64 %offset
+ call void @llvm.aarch64.sve.st4q.nxv2i64(<vscale x 2 x i64> %v0,
+ <vscale x 2 x i64> %v1,
+ <vscale x 2 x i64> %v2,
+ <vscale x 2 x i64> %v3,
+ <vscale x 2 x i1> %pred,
+ ptr %1)
+ ret void
+}
+
+define void @st4q_ss_f16(<vscale x 8 x half> %v0, <vscale x 8 x half> %v1, <vscale x 8 x half> %v2, <vscale x 8 x half> %v3, <vscale x 8 x i1> %pred, ptr %addr, i64 %offset) {
+; CHECK-LABEL: st4q_ss_f16:
+; CHECK: // %bb.0:
+; CHECK-NEXT: // kill: def $z3 killed $z3 killed $z0_z1_z2_z3 def $z0_z1_z2_z3
+; CHECK-NEXT: // kill: def $z2 killed $z2 killed $z0_z1_z2_z3 def $z0_z1_z2_z3
+; CHECK-NEXT: // kill: def $z1 killed $z1 killed $z0_z1_z2_z3 def $z0_z1_z2_z3
+; CHECK-NEXT: // kill: def $z0 killed $z0 killed $z0_z1_z2_z3 def $z0_z1_z2_z3
+; CHECK-NEXT: st4q { z0.q - z3.q }, p0, [x0, x1, lsl #4]
+; CHECK-NEXT: ret
+ %1 = getelementptr i128, ptr %addr, i64 %offset
+ call void @llvm.aarch64.sve.st4q.nxv8f16(<vscale x 8 x half> %v0,
+ <vscale x 8 x half> %v1,
+ <vscale x 8 x half> %v2,
+ <vscale x 8 x half> %v3,
+ <vscale x 8 x i1> %pred,
+ ptr %1)
+ ret void
+}
+
+define void @st4q_ss_f32(<vscale x 4 x float> %v0, <vscale x 4 x float> %v1, <vscale x 4 x float> %v2, <vscale x 4 x float> %v3, <vscale x 4 x i1> %pred, ptr %addr, i64 %offset) {
+; CHECK-LABEL: st4q_ss_f32:
+; CHECK: // %bb.0:
+; CHECK-NEXT: // kill: def $z3 killed $z3 killed $z0_z1_z2_z3 def $z0_z1_z2_z3
+; CHECK-NEXT: // kill: def $z2 killed $z2 killed $z0_z1_z2_z3 def $z0_z1_z2_z3
+; CHECK-NEXT: // kill: def $z1 killed $z1 killed $z0_z1_z2_z3 def $z0_z1_z2_z3
+; CHECK-NEXT: // kill: def $z0 killed $z0 killed $z0_z1_z2_z3 def $z0_z1_z2_z3
+; CHECK-NEXT: st4q { z0.q - z3.q }, p0, [x0, x1, lsl #4]
+; CHECK-NEXT: ret
+ %1 = getelementptr i128, ptr %addr, i64 %offset
+ call void @llvm.aarch64.sve.st4q.nxv4f32(<vscale x 4 x float> %v0,
+ <vscale x 4 x float> %v1,
+ <vscale x 4 x float> %v2,
+ <vscale x 4 x float> %v3,
+ <vscale x 4 x i1> %pred,
+ ptr %1)
+ ret void
+}
+
+define void @st4q_ss_f64(<vscale x 2 x double> %v0, <vscale x 2 x double> %v1, <vscale x 2 x double> %v2, <vscale x 2 x double> %v3, <vscale x 2 x i1> %pred, ptr %addr, i64 %offset) {
+; CHECK-LABEL: st4q_ss_f64:
+; CHECK: // %bb.0:
+; CHECK-NEXT: // kill: def $z3 killed $z3 killed $z0_z1_z2_z3 def $z0_z1_z2_z3
+; CHECK-NEXT: // kill: def $z2 killed $z2 killed $z0_z1_z2_z3 def $z0_z1_z2_z3
+; CHECK-NEXT: // kill: def $z1 killed $z1 killed $z0_z1_z2_z3 def $z0_z1_z2_z3
+; CHECK-NEXT: // kill: def $z0 killed $z0 killed $z0_z1_z2_z3 def $z0_z1_z2_z3
+; CHECK-NEXT: st4q { z0.q - z3.q }, p0, [x0, x1, lsl #4]
+; CHECK-NEXT: ret
+ %1 = getelementptr i128, ptr %addr, i64 %offset
+ call void @llvm.aarch64.sve.st4q.nxv2f64(<vscale x 2 x double> %v0,
+ <vscale x 2 x double> %v1,
+ <vscale x 2 x double> %v2,
+ <vscale x 2 x double> %v3,
+ <vscale x 2 x i1> %pred,
+ ptr %1)
+ ret void
+}
+
+define void @st4q_ss_bf16(<vscale x 8 x bfloat> %v0, <vscale x 8 x bfloat> %v1, <vscale x 8 x bfloat> %v2, <vscale x 8 x bfloat> %v3, <vscale x 8 x i1> %pred, ptr %addr, i64 %offset) {
+; CHECK-LABEL: st4q_ss_bf16:
+; CHECK: // %bb.0:
+; CHECK-NEXT: // kill: def $z3 killed $z3 killed $z0_z1_z2_z3 def $z0_z1_z2_z3
+; CHECK-NEXT: // kill: def $z2 killed $z2 killed $z0_z1_z2_z3 def $z0_z1_z2_z3
+; CHECK-NEXT: // kill: def $z1 killed $z1 killed $z0_z1_z2_z3 def $z0_z1_z2_z3
+; CHECK-NEXT: // kill: def $z0 killed $z0 killed $z0_z1_z2_z3 def $z0_z1_z2_z3
+; CHECK-NEXT: st4q { z0.q - z3.q }, p0, [x0, x1, lsl #4]
+; CHECK-NEXT: ret
+ %1 = getelementptr i128, ptr %addr, i64 %offset
+ call void @llvm.aarch64.sve.st4q.nxv8bf16(<vscale x 8 x bfloat> %v0,
+ <vscale x 8 x bfloat> %v1,
+ <vscale x 8 x bfloat> %v2,
+ <vscale x 8 x bfloat> %v3,
+ <vscale x 8 x i1> %pred,
+ ptr %1)
+ ret void
+}
+
+define void @st4q_si_i8_off32(<vscale x 16 x i8> %v0, <vscale x 16 x i8> %v1, <vscale x 16 x i8> %v2,<vscale x 16 x i8> %v3, <vscale x 16 x i1> %pred, <vscale x 16 x i8>* %addr) {
+; CHECK-LABEL: st4q_si_i8_off32:
+; CHECK: // %bb.0:
+; CHECK-NEXT: // kill: def $z3 killed $z3 killed $z0_z1_z2_z3 def $z0_z1_z2_z3
+; CHECK-NEXT: // kill: def $z2 killed $z2 killed $z0_z1_z2_z3 def $z0_z1_z2_z3
+; CHECK-NEXT: // kill: def $z1 killed $z1 killed $z0_z1_z2_z3 def $z0_z1_z2_z3
+; CHECK-NEXT: // kill: def $z0 killed $z0 killed $z0_z1_z2_z3 def $z0_z1_z2_z3
+; CHECK-NEXT: st4q { z0.q - z3.q }, p0, [x0, #-32, mul vl]
+; CHECK-NEXT: ret
+ %base = getelementptr <vscale x 16 x i8>, <vscale x 16 x i8>* %addr, i64 -32
+ call void @llvm.aarch64.sve.st4q.nxv16i8(<vscale x 16 x i8> %v0,
+ <vscale x 16 x i8> %v1,
+ <vscale x 16 x i8> %v2,
+ <vscale x 16 x i8> %v3,
+ <vscale x 16 x i1> %pred,
+ i8* %base)
+ ret void
+}
+
+define void @st4q_si_i8_off28(<vscale x 16 x i8> %v0, <vscale x 16 x i8> %v1, <vscale x 16 x i8> %v2,<vscale x 16 x i8> %v3, <vscale x 16 x i1> %pred, <vscale x 16 x i8>* %addr) {
+; CHECK-LABEL: st4q_si_i8_off28:
+; CHECK: // %bb.0:
+; CHECK-NEXT: // kill: def $z3 killed $z3 killed $z0_z1_z2_z3 def $z0_z1_z2_z3
+; CHECK-NEXT: // kill: def $z2 killed $z2 killed $z0_z1_z2_z3 def $z0_z1_z2_z3
+; CHECK-NEXT: // kill: def $z1 killed $z1 killed $z0_z1_z2_z3 def $z0_z1_z2_z3
+; CHECK-NEXT: // kill: def $z0 killed $z0 killed $z0_z1_z2_z3 def $z0_z1_z2_z3
+; CHECK-NEXT: st4q { z0.q - z3.q }, p0, [x0, #28, mul vl]
+; CHECK-NEXT: ret
+ %base = getelementptr <vscale x 16 x i8>, <vscale x 16 x i8>* %addr, i64 28
+ call void @llvm.aarch64.sve.st4q.nxv16i8(<vscale x 16 x i8> %v0,
+ <vscale x 16 x i8> %v1,
+ <vscale x 16 x i8> %v2,
+ <vscale x 16 x i8> %v3,
+ <vscale x 16 x i1> %pred,
+ i8* %base)
+ ret void
+}
+
+define void @st4q_si_i16(<vscale x 8 x i16> %v0, <vscale x 8 x i16> %v1, <vscale x 8 x i16> %v2, <vscale x 8 x i16> %v3, <vscale x 8 x i1> %pred, <vscale x 8 x i16>* %addr) {
+; CHECK-LABEL: st4q_si_i16:
+; CHECK: // %bb.0:
+; CHECK-NEXT: // kill: def $z3 killed $z3 killed $z0_z1_z2_z3 def $z0_z1_z2_z3
+; CHECK-NEXT: // kill: def $z2 killed $z2 killed $z0_z1_z2_z3 def $z0_z1_z2_z3
+; CHECK-NEXT: // kill: def $z1 killed $z1 killed $z0_z1_z2_z3 def $z0_z1_z2_z3
+; CHECK-NEXT: // kill: def $z0 killed $z0 killed $z0_z1_z2_z3 def $z0_z1_z2_z3
+; CHECK-NEXT: st4q { z0.q - z3.q }, p0, [x0, #28, mul vl]
+; CHECK-NEXT: ret
+ %base = getelementptr <vscale x 8 x i16>, <vscale x 8 x i16>* %addr, i64 28
+ call void @llvm.aarch64.sve.st4q.nxv8i16(<vscale x 8 x i16> %v0,
+ <vscale x 8 x i16> %v1,
+ <vscale x 8 x i16> %v2,
+ <vscale x 8 x i16> %v3,
+ <vscale x 8 x i1> %pred,
+ i8* %base)
+ ret void
+}
+
+define void @st4q_si_i32(<vscale x 4 x i32> %v0, <vscale x 4 x i32> %v1, <vscale x 4 x i32> %v2, <vscale x 4 x i32> %v3, <vscale x 4 x i1> %pred, <vscale x 4 x i32>* %addr) {
+; CHECK-LABEL: st4q_si_i32:
+; CHECK: // %bb.0:
+; CHECK-NEXT: // kill: def $z3 killed $z3 killed $z0_z1_z2_z3 def $z0_z1_z2_z3
+; CHECK-NEXT: // kill: def $z2 killed $z2 killed $z0_z1_z2_z3 def $z0_z1_z2_z3
+; CHECK-NEXT: // kill: def $z1 killed $z1 killed $z0_z1_z2_z3 def $z0_z1_z2_z3
+; CHECK-NEXT: // kill: def $z0 killed $z0 killed $z0_z1_z2_z3 def $z0_z1_z2_z3
+; CHECK-NEXT: st4q { z0.q - z3.q }, p0, [x0, #28, mul vl]
+; CHECK-NEXT: ret
+ %base1 = getelementptr <vscale x 4 x i32>, <vscale x 4 x i32>* %addr, i64 28
+ call void @llvm.aarch64.sve.st4q.nxv4i32(<vscale x 4 x i32> %v0,
+ <vscale x 4 x i32> %v1,
+ <vscale x 4 x i32> %v2,
+ <vscale x 4 x i32> %v3,
+ <vscale x 4 x i1> %pred,
+ i32* %base1)
+ ret void
+}
+
+define void @st4q_si_i64(<vscale x 2 x i64> %v0, <vscale x 2 x i64> %v1, <vscale x 2 x i64> %v2, <vscale x 2 x i64> %v3, <vscale x 2 x i1> %pred, <vscale x 2 x i64>* %addr) {
+; CHECK-LABEL: st4q_si_i64:
+; CHECK: // %bb.0:
+; CHECK-NEXT: // kill: def $z3 killed $z3 killed $z0_z1_z2_z3 def $z0_z1_z2_z3
+; CHECK-NEXT: // kill: def $z2 killed $z2 killed $z0_z1_z2_z3 def $z0_z1_z2_z3
+; CHECK-NEXT: // kill: def $z1 killed $z1 killed $z0_z1_z2_z3 def $z0_z1_z2_z3
+; CHECK-NEXT: // kill: def $z0 killed $z0 killed $z0_z1_z2_z3 def $z0_z1_z2_z3
+; CHECK-NEXT: st4q { z0.q - z3.q }, p0, [x0, #28, mul vl]
+; CHECK-NEXT: ret
+ %base = getelementptr <vscale x 2 x i64>, <vscale x 2 x i64>* %addr, i64 28
+ call void @llvm.aarch64.sve.st4q.nxv2i64(<vscale x 2 x i64> %v0,
+ <vscale x 2 x i64> %v1,
+ <vscale x 2 x i64> %v2,
+ <vscale x 2 x i64> %v3,
+ <vscale x 2 x i1> %pred,
+ i64* %base)
+ ret void
+}
+
+define void @st4q_si_f16(<vscale x 8 x half> %v0, <vscale x 8 x half> %v1, <vscale x 8 x half> %v2, <vscale x 8 x half> %v3, <vscale x 8 x i1> %pred, <vscale x 8 x half>* %addr) {
+; CHECK-LABEL: st4q_si_f16:
+; CHECK: // %bb.0:
+; CHECK-NEXT: // kill: def $z3 killed $z3 killed $z0_z1_z2_z3 def $z0_z1_z2_z3
+; CHECK-NEXT: // kill: def $z2 killed $z2 killed $z0_z1_z2_z3 def $z0_z1_z2_z3
+; CHECK-NEXT: // kill: def $z1 killed $z1 killed $z0_z1_z2_z3 def $z0_z1_z2_z3
+; CHECK-NEXT: // kill: def $z0 killed $z0 killed $z0_z1_z2_z3 def $z0_z1_z2_z3
+; CHECK-NEXT: st4q { z0.q - z3.q }, p0, [x0, #28, mul vl]
+; CHECK-NEXT: ret
+ %base = getelementptr <vscale x 8 x half>, <vscale x 8 x half>* %addr, i64 28
+ call void @llvm.aarch64.sve.st4q.nxv8f16(<vscale x 8 x half> %v0,
+ <vscale x 8 x half> %v1,
+ <vscale x 8 x half> %v2,
+ <vscale x 8 x half> %v3,
+ <vscale x 8 x i1> %pred,
+ half* %base)
+ ret void
+}
+
+define void @st4q_si_f32(<vscale x 4 x float> %v0, <vscale x 4 x float> %v1, <vscale x 4 x float> %v2,<vscale x 4 x float> %v3, <vscale x 4 x i1> %pred, <vscale x 4 x float>* %addr) {
+; CHECK-LABEL: st4q_si_f32:
+; CHECK: // %bb.0:
+; CHECK-NEXT: // kill: def $z3 killed $z3 killed $z0_z1_z2_z3 def $z0_z1_z2_z3
+; CHECK-NEXT: // kill: def $z2 killed $z2 killed $z0_z1_z2_z3 def $z0_z1_z2_z3
+; CHECK-NEXT: // kill: def $z1 killed $z1 killed $z0_z1_z2_z3 def $z0_z1_z2_z3
+; CHECK-NEXT: // kill: def $z0 killed $z0 killed $z0_z1_z2_z3 def $z0_z1_z2_z3
+; CHECK-NEXT: st4q { z0.q - z3.q }, p0, [x0, #28, mul vl]
+; CHECK-NEXT: ret
+ %base = getelementptr <vscale x 4 x float>, <vscale x 4 x float>* %addr, i64 28
+ call void @llvm.aarch64.sve.st4q.nxv4f32(<vscale x 4 x float> %v0,
+ <vscale x 4 x float> %v1,
+ <vscale x 4 x float> %v2,
+ <vscale x 4 x float> %v3,
+ <vscale x 4 x i1> %pred,
+ float* %base)
+ ret void
+}
+
+define void @st4q_si_f64(<vscale x 2 x double> %v0, <vscale x 2 x double> %v1, <vscale x 2 x double> %v2, <vscale x 2 x double> %v3, <vscale x 2 x i1> %pred, <vscale x 2 x double>* %addr) {
+; CHECK-LABEL: st4q_si_f64:
+; CHECK: // %bb.0:
+; CHECK-NEXT: // kill: def $z3 killed $z3 killed $z0_z1_z2_z3 def $z0_z1_z2_z3
+; CHECK-NEXT: // kill: def $z2 killed $z2 killed $z0_z1_z2_z3 def $z0_z1_z2_z3
+; CHECK-NEXT: // kill: def $z1 killed $z1 killed $z0_z1_z2_z3 def $z0_z1_z2_z3
+; CHECK-NEXT: // kill: def $z0 killed $z0 killed $z0_z1_z2_z3 def $z0_z1_z2_z3
+; CHECK-NEXT: st4q { z0.q - z3.q }, p0, [x0, #28, mul vl]
+; CHECK-NEXT: ret
+ %base = getelementptr <vscale x 2 x double>, <vscale x 2 x double>* %addr, i64 28
+ call void @llvm.aarch64.sve.st4q.nxv2f64(<vscale x 2 x double> %v0,
+ <vscale x 2 x double> %v1,
+ <vscale x 2 x double> %v2,
+ <vscale x 2 x double> %v3,
+ <vscale x 2 x i1> %pred,
+ double* %base)
+ ret void
+}
+
+define void @st4q_si_bf16(<vscale x 8 x bfloat> %v0, <vscale x 8 x bfloat> %v1, <vscale x 8 x bfloat> %v2, <vscale x 8 x bfloat> %v3, <vscale x 8 x i1> %pred, <vscale x 8 x bfloat>* %addr) {
+; CHECK-LABEL: st4q_si_bf16:
+; CHECK: // %bb.0:
+; CHECK-NEXT: // kill: def $z3 killed $z3 killed $z0_z1_z2_z3 def $z0_z1_z2_z3
+; CHECK-NEXT: // kill: def $z2 killed $z2 killed $z0_z1_z2_z3 def $z0_z1_z2_z3
+; CHECK-NEXT: // kill: def $z1 killed $z1 killed $z0_z1_z2_z3 def $z0_z1_z2_z3
+; CHECK-NEXT: // kill: def $z0 killed $z0 killed $z0_z1_z2_z3 def $z0_z1_z2_z3
+; CHECK-NEXT: st4q { z0.q - z3.q }, p0, [x0, #28, mul vl]
+; CHECK-NEXT: ret
+ %base = getelementptr <vscale x 8 x bfloat>, <vscale x 8 x bfloat>* %addr, i64 28
+ call void @llvm.aarch64.sve.st4q.nxv8bf16(<vscale x 8 x bfloat> %v0,
+ <vscale x 8 x bfloat> %v1,
+ <vscale x 8 x bfloat> %v2,
+ <vscale x 8 x bfloat> %v3,
+ <vscale x 8 x i1> %pred,
+ bfloat* %base)
+ ret void
+}
+
+
+declare void @llvm.aarch64.sve.st2q.nxv16i8(<vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i1>, ptr)
+declare void @llvm.aarch64.sve.st2q.nxv8i16(<vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i1>, ptr)
+declare void @llvm.aarch64.sve.st2q.nxv4i32(<vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i1>, ptr)
+declare void @llvm.aarch64.sve.st2q.nxv2i64(<vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i1>, ptr)
+
+declare void @llvm.aarch64.sve.st2q.nxv8f16(<vscale x 8 x half>, <vscale x 8 x half>, <vscale x 8 x i1>, ptr)
+declare void @llvm.aarch64.sve.st2q.nxv4f32(<vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x i1>, ptr)
+declare void @llvm.aarch64.sve.st2q.nxv2f64(<vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x i1>, ptr)
+declare void @llvm.aarch64.sve.st2q.nxv8bf16(<vscale x 8 x bfloat>, <vscale x 8 x bfloat>, <vscale x 8 x i1>, ptr)
+
+declare void @llvm.aarch64.sve.st3q.nxv16i8(<vscale x 16 x i8>, <vscale x 16 x i8>,<vscale x 16 x i8>, <vscale x 16 x i1>, ptr)
+declare void @llvm.aarch64.sve.st3q.nxv8i16(<vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i1>, ptr)
+declare void @llvm.aarch64.sve.st3q.nxv4i32(<vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i1>, ptr)
+declare void @llvm.aarch64.sve.st3q.nxv2i64(<vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i1>, ptr)
+
+declare void @llvm.aarch64.sve.st3q.nxv8f16(<vscale x 8 x half>, <vscale x 8 x half>, <vscale x 8 x half>, <vscale x 8 x i1>, ptr)
+declare void @llvm.aarch64.sve.st3q.nxv4f32(<vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x i1>, ptr)
+declare void @llvm.aarch64.sve.st3q.nxv2f64(<vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x i1>, ptr)
+declare void @llvm.aarch64.sve.st3q.nxv8bf16(<vscale x 8 x bfloat>, <vscale x 8 x bfloat>, <vscale x 8 x bfloat>, <vscale x 8 x i1>, ptr)
+
+declare void @llvm.aarch64.sve.st4q.nxv16i8(<vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>,<vscale x 16 x i1>, ptr)
+declare void @llvm.aarch64.sve.st4q.nxv8i16(<vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i1>, ptr)
+declare void @llvm.aarch64.sve.st4q.nxv4i32(<vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32>,<vscale x 4 x i1>, ptr)
+declare void @llvm.aarch64.sve.st4q.nxv2i64(<vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i1>, ptr)
+
+declare void @llvm.aarch64.sve.st4q.nxv8f16(<vscale x 8 x half>, <vscale x 8 x half>, <vscale x 8 x half>, <vscale x 8 x half>, <vscale x 8 x i1>, ptr)
+declare void @llvm.aarch64.sve.st4q.nxv4f32(<vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x i1>, ptr)
+declare void @llvm.aarch64.sve.st4q.nxv2f64(<vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x i1>, ptr)
+declare void @llvm.aarch64.sve.st4q.nxv8bf16(<vscale x 8 x bfloat>, <vscale x 8 x bfloat>, <vscale x 8 x bfloat>, <vscale x 8 x bfloat>, <vscale x 8 x i1>, ptr)
diff --git a/llvm/test/CodeGen/AArch64/sve2p1-intrinsics-scatter-stores-128bit-unscaled-offset.ll b/llvm/test/CodeGen/AArch64/sve2p1-intrinsics-scatter-stores-128bit-unscaled-offset.ll
new file mode 100644
index 000000000000000..7cb2eea50f9309b
--- /dev/null
+++ b/llvm/test/CodeGen/AArch64/sve2p1-intrinsics-scatter-stores-128bit-unscaled-offset.ll
@@ -0,0 +1,113 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc -mtriple=aarch64-linux-gnu -mattr=+sve2p1,+bf16 < %s | FileCheck %s
+
+;
+; ST1Q: vector base + unscaled offset
+; e.g. st1q { z0.q }, p0, [z1.d, x0]
+;
+
+define void @sst1_scatter_u64base_offset_i8(<vscale x 16 x i8> %data, <vscale x 16 x i1> %pg, <vscale x 2 x i64> %b, i64 %offset) {
+; CHECK-LABEL: sst1_scatter_u64base_offset_i8:
+; CHECK: // %bb.0:
+; CHECK-NEXT: st1q { z0.q }, p0, [z1.d, x0]
+; CHECK-NEXT: ret
+ call void @llvm.aarch64.sve.st1q.scatter.scalar.offset.nxv16i8.nxv2i64(<vscale x 16 x i8> %data,
+ <vscale x 16 x i1> %pg,
+ <vscale x 2 x i64> %b,
+ i64 %offset)
+ ret void
+}
+
+define void @sst1_scatter_u64base_offset_i16(<vscale x 8 x i16> %data, <vscale x 8 x i1> %pg, <vscale x 2 x i64> %b, i64 %offset) {
+; CHECK-LABEL: sst1_scatter_u64base_offset_i16:
+; CHECK: // %bb.0:
+; CHECK-NEXT: st1q { z0.q }, p0, [z1.d, x0]
+; CHECK-NEXT: ret
+ call void @llvm.aarch64.sve.st1q.scatter.scalar.offset.nxv8i16.nxv2i64(<vscale x 8 x i16> %data,
+ <vscale x 8 x i1> %pg,
+ <vscale x 2 x i64> %b,
+ i64 %offset)
+ ret void
+}
+
+define void @sst1_scatter_u64base_offset_i32(<vscale x 4 x i32> %data, <vscale x 4 x i1> %pg, <vscale x 2 x i64> %b, i64 %offset) {
+; CHECK-LABEL: sst1_scatter_u64base_offset_i32:
+; CHECK: // %bb.0:
+; CHECK-NEXT: st1q { z0.q }, p0, [z1.d, x0]
+; CHECK-NEXT: ret
+ call void @llvm.aarch64.sve.st1q.scatter.scalar.offset.nxv4i32.nxv2i64(<vscale x 4 x i32> %data,
+ <vscale x 4 x i1> %pg,
+ <vscale x 2 x i64> %b,
+ i64 %offset)
+ ret void
+}
+
+define void @sst1_scatter_u64base_offset_i64(<vscale x 2 x i64> %data, <vscale x 2 x i1> %pg, <vscale x 2 x i64> %b, i64 %offset) {
+; CHECK-LABEL: sst1_scatter_u64base_offset_i64:
+; CHECK: // %bb.0:
+; CHECK-NEXT: st1q { z0.q }, p0, [z1.d, x0]
+; CHECK-NEXT: ret
+ call void @llvm.aarch64.sve.st1q.scatter.scalar.offset.nxv2i64.nxv2i64(<vscale x 2 x i64> %data,
+ <vscale x 2 x i1> %pg,
+ <vscale x 2 x i64> %b,
+ i64 %offset)
+ ret void
+}
+
+define void @sst1_scatter_u64base_offset_f16(<vscale x 8 x half> %data, <vscale x 8 x i1> %pg, <vscale x 2 x i64> %base, i64 %offset) {
+; CHECK-LABEL: sst1_scatter_u64base_offset_f16:
+; CHECK: // %bb.0:
+; CHECK-NEXT: st1q { z0.q }, p0, [z1.d, x0]
+; CHECK-NEXT: ret
+ call void @llvm.aarch64.sve.st1q.scatter.scalar.offset.nxv8f16.nxv2i64(<vscale x 8 x half> %data,
+ <vscale x 8 x i1> %pg,
+ <vscale x 2 x i64> %base,
+ i64 %offset)
+ ret void
+}
+
+define void @sst1_scatter_u64base_offset_f32(<vscale x 4 x float> %data, <vscale x 4 x i1> %pg, <vscale x 2 x i64> %b, i64 %offset) {
+; CHECK-LABEL: sst1_scatter_u64base_offset_f32:
+; CHECK: // %bb.0:
+; CHECK-NEXT: st1q { z0.q }, p0, [z1.d, x0]
+; CHECK-NEXT: ret
+ call void @llvm.aarch64.sve.st1q.scatter.scalar.offset.nxv4f32.nxv2i64(<vscale x 4 x float> %data,
+ <vscale x 4 x i1> %pg,
+ <vscale x 2 x i64> %b,
+ i64 %offset)
+ ret void
+}
+
+define void @sst1_scatter_u64base_offset_f64(<vscale x 2 x double> %data, <vscale x 2 x i1> %pg, <vscale x 2 x i64> %b, i64 %offset) {
+; CHECK-LABEL: sst1_scatter_u64base_offset_f64:
+; CHECK: // %bb.0:
+; CHECK-NEXT: st1q { z0.q }, p0, [z1.d, x0]
+; CHECK-NEXT: ret
+ call void @llvm.aarch64.sve.st1q.scatter.scalar.offset.nxv2f64.nxv2i64(<vscale x 2 x double> %data,
+ <vscale x 2 x i1> %pg,
+ <vscale x 2 x i64> %b,
+ i64 %offset)
+ ret void
+}
+
+define void @sst1_scatter_u64base_offset_bf16(<vscale x 8 x bfloat> %data, <vscale x 8 x i1> %pg, <vscale x 2 x i64> %b, i64 %offset) {
+; CHECK-LABEL: sst1_scatter_u64base_offset_bf16:
+; CHECK: // %bb.0:
+; CHECK-NEXT: st1q { z0.q }, p0, [z1.d, x0]
+; CHECK-NEXT: ret
+ call void @llvm.aarch64.sve.st1q.scatter.scalar.offset.nxv8bf16.nxv2i64(<vscale x 8 x bfloat> %data,
+ <vscale x 8 x i1> %pg,
+ <vscale x 2 x i64> %b,
+ i64 %offset)
+ ret void
+}
+
+declare void @llvm.aarch64.sve.st1q.scatter.scalar.offset.nxv16i8.nxv2i64(<vscale x 16 x i8>, <vscale x 16 x i1>, <vscale x 2 x i64>, i64)
+declare void @llvm.aarch64.sve.st1q.scatter.scalar.offset.nxv8i16.nxv2i64(<vscale x 8 x i16>, <vscale x 8 x i1>, <vscale x 2 x i64>, i64)
+declare void @llvm.aarch64.sve.st1q.scatter.scalar.offset.nxv4i32.nxv2i64(<vscale x 4 x i32>, <vscale x 4 x i1>, <vscale x 2 x i64>, i64)
+declare void @llvm.aarch64.sve.st1q.scatter.scalar.offset.nxv2i64.nxv2i64(<vscale x 2 x i64>, <vscale x 2 x i1>, <vscale x 2 x i64>, i64)
+
+declare void @llvm.aarch64.sve.st1q.scatter.scalar.offset.nxv8f16.nxv2i64(<vscale x 8 x half>, <vscale x 8 x i1>, <vscale x 2 x i64>, i64)
+declare void @llvm.aarch64.sve.st1q.scatter.scalar.offset.nxv4f32.nxv2i64(<vscale x 4 x float>, <vscale x 4 x i1>, <vscale x 2 x i64>, i64)
+declare void @llvm.aarch64.sve.st1q.scatter.scalar.offset.nxv2f64.nxv2i64(<vscale x 2 x double>, <vscale x 2 x i1>, <vscale x 2 x i64>, i64)
+declare void @llvm.aarch64.sve.st1q.scatter.scalar.offset.nxv8bf16.nxv2i64(<vscale x 8 x bfloat>, <vscale x 8 x i1>, <vscale x 2 x i64>, i64)
diff --git a/llvm/test/CodeGen/AArch64/sve2p1-intrinsics-st1-single.ll b/llvm/test/CodeGen/AArch64/sve2p1-intrinsics-st1-single.ll
new file mode 100644
index 000000000000000..e93673c79c30a38
--- /dev/null
+++ b/llvm/test/CodeGen/AArch64/sve2p1-intrinsics-st1-single.ll
@@ -0,0 +1,130 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 2
+; RUN: llc -mtriple=aarch64--linux-gnu -mattr=+sve2p1 < %s | FileCheck %s
+
+; ST1W
+
+define void @test_svst1uwq_i32_ss(<vscale x 4 x i32> %zt, <vscale x 1 x i1> %pred, ptr %base, i64 %offset) {
+; CHECK-LABEL: test_svst1uwq_i32_ss:
+; CHECK: // %bb.0:
+; CHECK-NEXT: st1w { z0.q }, p0, [x0, x1, lsl #2]
+; CHECK-NEXT: ret
+ %gep = getelementptr i32, ptr %base, i64 %offset
+ call void @llvm.aarch64.sve.st1uwq.nxv4i32(<vscale x 4 x i32> %zt, <vscale x 1 x i1> %pred, ptr %gep)
+ ret void
+}
+
+define void @test_svst1uwq_i32_si(<vscale x 4 x i32> %zt, <vscale x 1 x i1> %pred, <vscale x 1 x i32>* %base) {
+; CHECK-LABEL: test_svst1uwq_i32_si:
+; CHECK: // %bb.0:
+; CHECK-NEXT: st1w { z0.q }, p0, [x0, #-8, mul vl]
+; CHECK-NEXT: st1w { z0.q }, p0, [x0, #7, mul vl]
+; CHECK-NEXT: ret
+ %gep1 = getelementptr inbounds <vscale x 1 x i32>, <vscale x 1 x i32>* %base, i64 -8
+ call void @llvm.aarch64.sve.st1uwq.nxv4i32(<vscale x 4 x i32> %zt, <vscale x 1 x i1> %pred, ptr %gep1)
+
+ %gep2 = getelementptr inbounds <vscale x 1 x i32>, <vscale x 1 x i32>* %base, i64 7
+ call void @llvm.aarch64.sve.st1uwq.nxv4i32(<vscale x 4 x i32> %zt, <vscale x 1 x i1> %pred, ptr %gep2)
+ ret void
+}
+
+define void @test_svst1uwq_i32_out_of_bound(<vscale x 4 x i32> %zt, <vscale x 1 x i1> %pred, <vscale x 1 x i32>* %base) {
+; CHECK-LABEL: test_svst1uwq_i32_out_of_bound:
+; CHECK: // %bb.0:
+; CHECK-NEXT: addvl x8, x0, #2
+; CHECK-NEXT: st1w { z0.q }, p0, [x8]
+; CHECK-NEXT: ret
+ %gep = getelementptr inbounds <vscale x 1 x i32>, <vscale x 1 x i32>* %base, i64 8
+ call void @llvm.aarch64.sve.st1uwq.nxv4i32(<vscale x 4 x i32> %zt, <vscale x 1 x i1> %pred, ptr %gep)
+ ret void
+}
+
+define void @test_svst1uwq_f32_ss(<vscale x 4 x float> %zt, <vscale x 1 x i1> %pred, ptr %base, i64 %offset) {
+; CHECK-LABEL: test_svst1uwq_f32_ss:
+; CHECK: // %bb.0:
+; CHECK-NEXT: st1w { z0.q }, p0, [x0, x1, lsl #2]
+; CHECK-NEXT: ret
+ %gep = getelementptr float, ptr %base, i64 %offset
+ call void @llvm.aarch64.sve.st1uwq.nxv4f32(<vscale x 4 x float> %zt, <vscale x 1 x i1> %pred, ptr %gep)
+ ret void
+}
+
+define void @test_svst1uwq_f32_si(<vscale x 4 x float> %zt, <vscale x 1 x i1> %pred, <vscale x 1 x float>* %base) {
+; CHECK-LABEL: test_svst1uwq_f32_si:
+; CHECK: // %bb.0:
+; CHECK-NEXT: st1w { z0.q }, p0, [x0, #-8, mul vl]
+; CHECK-NEXT: st1w { z0.q }, p0, [x0, #7, mul vl]
+; CHECK-NEXT: ret
+ %gep1 = getelementptr inbounds <vscale x 1 x float>, <vscale x 1 x float>* %base, i64 -8
+ call void @llvm.aarch64.sve.st1uwq.nxv4f32(<vscale x 4 x float> %zt, <vscale x 1 x i1> %pred, ptr %gep1)
+
+ %gep2 = getelementptr inbounds <vscale x 1 x float>, <vscale x 1 x float>* %base, i64 7
+ call void @llvm.aarch64.sve.st1uwq.nxv4f32(<vscale x 4 x float> %zt, <vscale x 1 x i1> %pred, ptr %gep2)
+ ret void
+}
+
+; ST1D
+
+define void @test_svst1udq_i64_ss(<vscale x 2 x i64> %zt, <vscale x 1 x i1> %pred, ptr %base, i64 %offset) {
+; CHECK-LABEL: test_svst1udq_i64_ss:
+; CHECK: // %bb.0:
+; CHECK-NEXT: st1d { z0.q }, p0, [x0, x1, lsl #3]
+; CHECK-NEXT: ret
+ %gep = getelementptr i64, ptr %base, i64 %offset
+ call void @llvm.aarch64.sve.st1udq.nxv2i64(<vscale x 2 x i64> %zt, <vscale x 1 x i1> %pred, ptr %gep)
+ ret void
+}
+
+define void @test_svst1udq_i64_si(<vscale x 2 x i64> %zt, <vscale x 1 x i1> %pred, <vscale x 1 x i64>* %base) {
+; CHECK-LABEL: test_svst1udq_i64_si:
+; CHECK: // %bb.0:
+; CHECK-NEXT: st1d { z0.q }, p0, [x0, #-8, mul vl]
+; CHECK-NEXT: st1d { z0.q }, p0, [x0, #7, mul vl]
+; CHECK-NEXT: ret
+ %gep1 = getelementptr inbounds <vscale x 1 x i64>, <vscale x 1 x i64>* %base, i64 -8
+ call void @llvm.aarch64.sve.st1udq.nxv2i64(<vscale x 2 x i64> %zt, <vscale x 1 x i1> %pred, ptr %gep1)
+
+ %gep2 = getelementptr inbounds <vscale x 1 x i64>, <vscale x 1 x i64>* %base, i64 7
+ call void @llvm.aarch64.sve.st1udq.nxv2i64(<vscale x 2 x i64> %zt, <vscale x 1 x i1> %pred, ptr %gep2)
+ ret void
+}
+
+define void @test_svst1udq_i64_out_of_bound(<vscale x 2 x i64> %zt, <vscale x 1 x i1> %pred, <vscale x 1 x i64>* %base) {
+; CHECK-LABEL: test_svst1udq_i64_out_of_bound:
+; CHECK: // %bb.0:
+; CHECK-NEXT: addvl x8, x0, #-5
+; CHECK-NEXT: st1d { z0.q }, p0, [x8]
+; CHECK-NEXT: ret
+ %gep = getelementptr inbounds <vscale x 1 x i64>, <vscale x 1 x i64>* %base, i64 -10
+ call void @llvm.aarch64.sve.st1udq.nxv2i64(<vscale x 2 x i64> %zt, <vscale x 1 x i1> %pred, ptr %gep)
+ ret void
+}
+
+define void @test_svst1udq_f64_ss(<vscale x 2 x double> %zt, <vscale x 1 x i1> %pred, ptr %base, i64 %offset) {
+; CHECK-LABEL: test_svst1udq_f64_ss:
+; CHECK: // %bb.0:
+; CHECK-NEXT: st1d { z0.q }, p0, [x0, x1, lsl #3]
+; CHECK-NEXT: ret
+ %gep = getelementptr double, ptr %base, i64 %offset
+ call void @llvm.aarch64.sve.st1udq.nxv2f64(<vscale x 2 x double> %zt, <vscale x 1 x i1> %pred, ptr %gep)
+ ret void
+}
+
+define void @test_svst1udq_f64_si(<vscale x 2 x double> %zt, <vscale x 1 x i1> %pred, <vscale x 1 x double>* %base) {
+; CHECK-LABEL: test_svst1udq_f64_si:
+; CHECK: // %bb.0:
+; CHECK-NEXT: st1d { z0.q }, p0, [x0, #-8, mul vl]
+; CHECK-NEXT: st1d { z0.q }, p0, [x0, #7, mul vl]
+; CHECK-NEXT: ret
+ %gep1 = getelementptr inbounds <vscale x 1 x double>, <vscale x 1 x double>* %base, i64 -8
+ call void @llvm.aarch64.sve.st1udq.nxv2f64(<vscale x 2 x double> %zt, <vscale x 1 x i1> %pred, ptr %gep1)
+
+ %gep2 = getelementptr inbounds <vscale x 1 x double>, <vscale x 1 x double>* %base, i64 7
+ call void @llvm.aarch64.sve.st1udq.nxv2f64(<vscale x 2 x double> %zt, <vscale x 1 x i1> %pred, ptr %gep2)
+ ret void
+}
+
+declare void @llvm.aarch64.sve.st1uwq.nxv4i32(<vscale x 4 x i32>, <vscale x 1 x i1>, ptr)
+declare void @llvm.aarch64.sve.st1uwq.nxv4f32(<vscale x 4 x float>, <vscale x 1 x i1>, ptr)
+
+declare void @llvm.aarch64.sve.st1udq.nxv2i64(<vscale x 2 x i64>, <vscale x 1 x i1>, ptr)
+declare void @llvm.aarch64.sve.st1udq.nxv2f64(<vscale x 2 x double>, <vscale x 1 x i1>, ptr)
>From 0422343f6d7edb9e72da081d85d583fe321c34ec Mon Sep 17 00:00:00 2001
From: Momchil Velikov <momchil.velikov at arm.com>
Date: Fri, 27 Oct 2023 16:09:07 +0100
Subject: [PATCH 2/2] [AArch64] Add SVE2.1 intrinsics for indexed quadword
gather loads and scatter stores
This patch adds the quadword gather load intrinsics of the form
(1) sv<type>_t svld1q_gather_u64index_<typ>(svbool_t, const <type>_t *, svuint64_t);
(2) sv<type>_t svld1q_gather_u64base_index_<typ>(svbool_t, svuint64_t, int64_t);
and the quadword scatter store intrinsics of the form
(3) void svst1q_scatter_u64index_<typ>(svbool_t, <type>_t *, svuint64_t, sv<type>_t);
(4) void svst1q_scatter_u64base_index_<typ>(svbool, svuint64_t, int64_t, sv<type>_t);
(intrinsics (1) and (3) are currently missing the variants for non 64-bit sized
base types, e.g. `int8_t` or `bfloat16_t`, etc).
ACLE spec: https://github.com/ARM-software/acle/pull/257
---
clang/include/clang/Basic/arm_sve.td | 12 +
.../acle_sve2p1_loads.c | 256 ++++++++++++++++++
.../acle_sve2p1_store.c | 256 ++++++++++++++++++
llvm/include/llvm/IR/IntrinsicsAArch64.td | 6 +
.../Target/AArch64/AArch64ISelLowering.cpp | 39 ++-
llvm/lib/Target/AArch64/AArch64ISelLowering.h | 2 +
...p1-intrinsics-gather-loads-128bit-index.ll | 187 +++++++++++++
...-intrinsics-scatter-stores-128bit-index.ll | 186 +++++++++++++
8 files changed, 933 insertions(+), 11 deletions(-)
create mode 100644 llvm/test/CodeGen/AArch64/sve2p1-intrinsics-gather-loads-128bit-index.ll
create mode 100644 llvm/test/CodeGen/AArch64/sve2p1-intrinsics-scatter-stores-128bit-index.ll
diff --git a/clang/include/clang/Basic/arm_sve.td b/clang/include/clang/Basic/arm_sve.td
index ff9a4062a35912c..1a295c290ad09e1 100644
--- a/clang/include/clang/Basic/arm_sve.td
+++ b/clang/include/clang/Basic/arm_sve.td
@@ -319,6 +319,12 @@ let TargetGuard = "sve2p1" in {
defm SVLD2Q_VNUM : StructLoad<"svld2q_vnum[_{2}]", "2Pcl", "aarch64_sve_ld2q_sret">;
defm SVLD3Q_VNUM : StructLoad<"svld3q_vnum[_{2}]", "3Pcl", "aarch64_sve_ld3q_sret">;
defm SVLD4Q_VNUM : StructLoad<"svld4q_vnum[_{2}]", "4Pcl", "aarch64_sve_ld4q_sret">;
+
+ // Load quadwords (scalar base + vector index)
+ def SVLD1Q_GATHER_INDICES_U : MInst<"svld1q_gather_[{3}]index[_{0}]", "dPcg", "lUld", [IsGatherLoad], MemEltTyDefault, "aarch64_sve_ld1q_gather_index">;
+
+ // Load quadwords (vector base + scalar index)
+ def SVLD1Q_GATHER_INDEX_S : MInst<"svld1q_gather[_{2}base]_index_{0}", "dPgl", "cUcsUsiUilUlbhfd", [IsGatherLoad], MemEltTyDefault, "aarch64_sve_ld1q_gather_scalar_offset">;
}
////////////////////////////////////////////////////////////////////////////////
@@ -464,6 +470,12 @@ let TargetGuard = "sve2p1" in {
defm SVST2Q_VNUM : StructStore<"svst2q_vnum[_{d}]", "vPcl2", "aarch64_sve_st2q">;
defm SVST3Q_VNUM : StructStore<"svst3q_vnum[_{d}]", "vPcl3", "aarch64_sve_st3q">;
defm SVST4Q_VNUM : StructStore<"svst4q_vnum[_{d}]", "vPcl4", "aarch64_sve_st4q">;
+
+ // Scatter store quadwords (scalar base + vector index)
+ def SVST1Q_SCATTER_INDICES_U : MInst<"svst1q_scatter_[{3}]index[_{0}]", "vPpgd", "lUld", [IsScatterStore], MemEltTyDefault, "aarch64_sve_st1q_scatter_index">;
+
+ // Scatter store quadwords (vector base + scalar index)
+ def SVST1Q_SCATTER_INDEX_S : MInst<"svst1q_scatter[_{2}base]_index[_{0}]", "vPgld", "cUcsUsiUilUlbhfd", [IsScatterStore], MemEltTyDefault, "aarch64_sve_st1q_scatter_scalar_offset">;
}
////////////////////////////////////////////////////////////////////////////////
diff --git a/clang/test/CodeGen/aarch64-sve2p1-intrinsics/acle_sve2p1_loads.c b/clang/test/CodeGen/aarch64-sve2p1-intrinsics/acle_sve2p1_loads.c
index 65d84219e7eb0d8..5f45d7d19aea845 100644
--- a/clang/test/CodeGen/aarch64-sve2p1-intrinsics/acle_sve2p1_loads.c
+++ b/clang/test/CodeGen/aarch64-sve2p1-intrinsics/acle_sve2p1_loads.c
@@ -2493,3 +2493,259 @@ svbfloat16_t test_svld1q_gather_u64base_bf16(svbool_t pg, svuint64_t base)
{
return SVE_ACLE_FUNC(svld1q_gather,_u64base,_bf16,)(pg, base);
}
+
+// CHECK-LABEL: @test_svld1q_gather_u64index_s64(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 2 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv2i1(<vscale x 16 x i1> [[PG:%.*]])
+// CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 2 x i64> @llvm.aarch64.sve.ld1q.gather.index.nxv2i64(<vscale x 2 x i1> [[TMP0]], ptr [[BASE:%.*]], <vscale x 2 x i64> [[IDX:%.*]])
+// CHECK-NEXT: ret <vscale x 2 x i64> [[TMP1]]
+//
+// CPP-CHECK-LABEL: @_Z31test_svld1q_gather_u64index_s64u10__SVBool_tPKlu12__SVUint64_t(
+// CPP-CHECK-NEXT: entry:
+// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 2 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv2i1(<vscale x 16 x i1> [[PG:%.*]])
+// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 2 x i64> @llvm.aarch64.sve.ld1q.gather.index.nxv2i64(<vscale x 2 x i1> [[TMP0]], ptr [[BASE:%.*]], <vscale x 2 x i64> [[IDX:%.*]])
+// CPP-CHECK-NEXT: ret <vscale x 2 x i64> [[TMP1]]
+//
+svint64_t test_svld1q_gather_u64index_s64(svbool_t pg, const int64_t *base, svuint64_t idx) {
+ return SVE_ACLE_FUNC(svld1q_gather_,u64,index,_s64) (pg, base, idx);
+}
+
+// CHECK-LABEL: @test_svld1q_gather_u64index_u64(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 2 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv2i1(<vscale x 16 x i1> [[PG:%.*]])
+// CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 2 x i64> @llvm.aarch64.sve.ld1q.gather.index.nxv2i64(<vscale x 2 x i1> [[TMP0]], ptr [[BASE:%.*]], <vscale x 2 x i64> [[IDX:%.*]])
+// CHECK-NEXT: ret <vscale x 2 x i64> [[TMP1]]
+//
+// CPP-CHECK-LABEL: @_Z31test_svld1q_gather_u64index_u64u10__SVBool_tPKmu12__SVUint64_t(
+// CPP-CHECK-NEXT: entry:
+// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 2 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv2i1(<vscale x 16 x i1> [[PG:%.*]])
+// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 2 x i64> @llvm.aarch64.sve.ld1q.gather.index.nxv2i64(<vscale x 2 x i1> [[TMP0]], ptr [[BASE:%.*]], <vscale x 2 x i64> [[IDX:%.*]])
+// CPP-CHECK-NEXT: ret <vscale x 2 x i64> [[TMP1]]
+//
+svuint64_t test_svld1q_gather_u64index_u64(svbool_t pg, const uint64_t *base, svuint64_t idx) {
+ return SVE_ACLE_FUNC(svld1q_gather_,u64,index,_u64) (pg, base, idx);
+}
+
+// CHECK-LABEL: @test_svld1q_gather_u64index_f64(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 2 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv2i1(<vscale x 16 x i1> [[PG:%.*]])
+// CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 2 x double> @llvm.aarch64.sve.ld1q.gather.index.nxv2f64(<vscale x 2 x i1> [[TMP0]], ptr [[BASE:%.*]], <vscale x 2 x i64> [[IDX:%.*]])
+// CHECK-NEXT: ret <vscale x 2 x double> [[TMP1]]
+//
+// CPP-CHECK-LABEL: @_Z31test_svld1q_gather_u64index_f64u10__SVBool_tPKdu12__SVUint64_t(
+// CPP-CHECK-NEXT: entry:
+// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 2 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv2i1(<vscale x 16 x i1> [[PG:%.*]])
+// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 2 x double> @llvm.aarch64.sve.ld1q.gather.index.nxv2f64(<vscale x 2 x i1> [[TMP0]], ptr [[BASE:%.*]], <vscale x 2 x i64> [[IDX:%.*]])
+// CPP-CHECK-NEXT: ret <vscale x 2 x double> [[TMP1]]
+//
+svfloat64_t test_svld1q_gather_u64index_f64(svbool_t pg, const float64_t *base, svuint64_t idx) {
+ return SVE_ACLE_FUNC(svld1q_gather_,u64,index,_f64) (pg, base, idx);
+}
+
+// CHECK-LABEL: @test_svld1q_gather_u64base_index_s8(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 16 x i8> @llvm.aarch64.sve.ld1q.gather.scalar.offset.nxv16i8.nxv2i64(<vscale x 16 x i1> [[PG:%.*]], <vscale x 2 x i64> [[BASE:%.*]], i64 [[IDX:%.*]])
+// CHECK-NEXT: ret <vscale x 16 x i8> [[TMP0]]
+//
+// CPP-CHECK-LABEL: @_Z35test_svld1q_gather_u64base_index_s8u10__SVBool_tu12__SVUint64_tl(
+// CPP-CHECK-NEXT: entry:
+// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 16 x i8> @llvm.aarch64.sve.ld1q.gather.scalar.offset.nxv16i8.nxv2i64(<vscale x 16 x i1> [[PG:%.*]], <vscale x 2 x i64> [[BASE:%.*]], i64 [[IDX:%.*]])
+// CPP-CHECK-NEXT: ret <vscale x 16 x i8> [[TMP0]]
+//
+svint8_t test_svld1q_gather_u64base_index_s8(svbool_t pg, svuint64_t base, int64_t idx) {
+ return SVE_ACLE_FUNC(svld1q_gather,_u64base,_index_s8,)(pg, base, idx);
+}
+
+// CHECK-LABEL: @test_svld1q_gather_u64base_index_u8(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 16 x i8> @llvm.aarch64.sve.ld1q.gather.scalar.offset.nxv16i8.nxv2i64(<vscale x 16 x i1> [[PG:%.*]], <vscale x 2 x i64> [[BASE:%.*]], i64 [[IDX:%.*]])
+// CHECK-NEXT: ret <vscale x 16 x i8> [[TMP0]]
+//
+// CPP-CHECK-LABEL: @_Z35test_svld1q_gather_u64base_index_u8u10__SVBool_tu12__SVUint64_tl(
+// CPP-CHECK-NEXT: entry:
+// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 16 x i8> @llvm.aarch64.sve.ld1q.gather.scalar.offset.nxv16i8.nxv2i64(<vscale x 16 x i1> [[PG:%.*]], <vscale x 2 x i64> [[BASE:%.*]], i64 [[IDX:%.*]])
+// CPP-CHECK-NEXT: ret <vscale x 16 x i8> [[TMP0]]
+//
+svuint8_t test_svld1q_gather_u64base_index_u8(svbool_t pg, svuint64_t base, int64_t idx) {
+ return SVE_ACLE_FUNC(svld1q_gather,_u64base,_index_u8,)(pg, base, idx);
+}
+
+// CHECK-LABEL: @test_svld1q_gather_u64base_index_s16(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 8 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv8i1(<vscale x 16 x i1> [[PG:%.*]])
+// CHECK-NEXT: [[TMP1:%.*]] = shl i64 [[IDX:%.*]], 1
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <vscale x 8 x i16> @llvm.aarch64.sve.ld1q.gather.scalar.offset.nxv8i16.nxv2i64(<vscale x 8 x i1> [[TMP0]], <vscale x 2 x i64> [[BASE:%.*]], i64 [[TMP1]])
+// CHECK-NEXT: ret <vscale x 8 x i16> [[TMP2]]
+//
+// CPP-CHECK-LABEL: @_Z36test_svld1q_gather_u64base_index_s16u10__SVBool_tu12__SVUint64_tl(
+// CPP-CHECK-NEXT: entry:
+// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 8 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv8i1(<vscale x 16 x i1> [[PG:%.*]])
+// CPP-CHECK-NEXT: [[TMP1:%.*]] = shl i64 [[IDX:%.*]], 1
+// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call <vscale x 8 x i16> @llvm.aarch64.sve.ld1q.gather.scalar.offset.nxv8i16.nxv2i64(<vscale x 8 x i1> [[TMP0]], <vscale x 2 x i64> [[BASE:%.*]], i64 [[TMP1]])
+// CPP-CHECK-NEXT: ret <vscale x 8 x i16> [[TMP2]]
+//
+svint16_t test_svld1q_gather_u64base_index_s16(svbool_t pg, svuint64_t base, int64_t idx) {
+ return SVE_ACLE_FUNC(svld1q_gather,_u64base,_index_s16,)(pg, base, idx);
+}
+
+// CHECK-LABEL: @test_svld1q_gather_u64base_index_u16(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 8 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv8i1(<vscale x 16 x i1> [[PG:%.*]])
+// CHECK-NEXT: [[TMP1:%.*]] = shl i64 [[IDX:%.*]], 1
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <vscale x 8 x i16> @llvm.aarch64.sve.ld1q.gather.scalar.offset.nxv8i16.nxv2i64(<vscale x 8 x i1> [[TMP0]], <vscale x 2 x i64> [[BASE:%.*]], i64 [[TMP1]])
+// CHECK-NEXT: ret <vscale x 8 x i16> [[TMP2]]
+//
+// CPP-CHECK-LABEL: @_Z36test_svld1q_gather_u64base_index_u16u10__SVBool_tu12__SVUint64_tl(
+// CPP-CHECK-NEXT: entry:
+// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 8 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv8i1(<vscale x 16 x i1> [[PG:%.*]])
+// CPP-CHECK-NEXT: [[TMP1:%.*]] = shl i64 [[IDX:%.*]], 1
+// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call <vscale x 8 x i16> @llvm.aarch64.sve.ld1q.gather.scalar.offset.nxv8i16.nxv2i64(<vscale x 8 x i1> [[TMP0]], <vscale x 2 x i64> [[BASE:%.*]], i64 [[TMP1]])
+// CPP-CHECK-NEXT: ret <vscale x 8 x i16> [[TMP2]]
+//
+svuint16_t test_svld1q_gather_u64base_index_u16(svbool_t pg, svuint64_t base, int64_t idx) {
+ return SVE_ACLE_FUNC(svld1q_gather,_u64base,_index_u16,)(pg, base, idx);
+}
+
+// CHECK-LABEL: @test_svld1q_gather_u64base_index_s32(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 4 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv4i1(<vscale x 16 x i1> [[PG:%.*]])
+// CHECK-NEXT: [[TMP1:%.*]] = shl i64 [[IDX:%.*]], 2
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <vscale x 4 x i32> @llvm.aarch64.sve.ld1q.gather.scalar.offset.nxv4i32.nxv2i64(<vscale x 4 x i1> [[TMP0]], <vscale x 2 x i64> [[BASE:%.*]], i64 [[TMP1]])
+// CHECK-NEXT: ret <vscale x 4 x i32> [[TMP2]]
+//
+// CPP-CHECK-LABEL: @_Z36test_svld1q_gather_u64base_index_s32u10__SVBool_tu12__SVUint64_tl(
+// CPP-CHECK-NEXT: entry:
+// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 4 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv4i1(<vscale x 16 x i1> [[PG:%.*]])
+// CPP-CHECK-NEXT: [[TMP1:%.*]] = shl i64 [[IDX:%.*]], 2
+// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call <vscale x 4 x i32> @llvm.aarch64.sve.ld1q.gather.scalar.offset.nxv4i32.nxv2i64(<vscale x 4 x i1> [[TMP0]], <vscale x 2 x i64> [[BASE:%.*]], i64 [[TMP1]])
+// CPP-CHECK-NEXT: ret <vscale x 4 x i32> [[TMP2]]
+//
+svint32_t test_svld1q_gather_u64base_index_s32(svbool_t pg, svuint64_t base, int64_t idx) {
+ return SVE_ACLE_FUNC(svld1q_gather,_u64base,_index_s32,)(pg, base, idx);
+}
+
+// CHECK-LABEL: @test_svld1q_gather_u64base_index_u32(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 4 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv4i1(<vscale x 16 x i1> [[PG:%.*]])
+// CHECK-NEXT: [[TMP1:%.*]] = shl i64 [[IDX:%.*]], 2
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <vscale x 4 x i32> @llvm.aarch64.sve.ld1q.gather.scalar.offset.nxv4i32.nxv2i64(<vscale x 4 x i1> [[TMP0]], <vscale x 2 x i64> [[BASE:%.*]], i64 [[TMP1]])
+// CHECK-NEXT: ret <vscale x 4 x i32> [[TMP2]]
+//
+// CPP-CHECK-LABEL: @_Z36test_svld1q_gather_u64base_index_u32u10__SVBool_tu12__SVUint64_tl(
+// CPP-CHECK-NEXT: entry:
+// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 4 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv4i1(<vscale x 16 x i1> [[PG:%.*]])
+// CPP-CHECK-NEXT: [[TMP1:%.*]] = shl i64 [[IDX:%.*]], 2
+// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call <vscale x 4 x i32> @llvm.aarch64.sve.ld1q.gather.scalar.offset.nxv4i32.nxv2i64(<vscale x 4 x i1> [[TMP0]], <vscale x 2 x i64> [[BASE:%.*]], i64 [[TMP1]])
+// CPP-CHECK-NEXT: ret <vscale x 4 x i32> [[TMP2]]
+//
+svuint32_t test_svld1q_gather_u64base_index_u32(svbool_t pg, svuint64_t base, int64_t idx) {
+ return SVE_ACLE_FUNC(svld1q_gather,_u64base,_index_u32,)(pg, base, idx);
+}
+
+// CHECK-LABEL: @test_svld1q_gather_u64base_index_s64(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 2 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv2i1(<vscale x 16 x i1> [[PG:%.*]])
+// CHECK-NEXT: [[TMP1:%.*]] = shl i64 [[IDX:%.*]], 3
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <vscale x 2 x i64> @llvm.aarch64.sve.ld1q.gather.scalar.offset.nxv2i64.nxv2i64(<vscale x 2 x i1> [[TMP0]], <vscale x 2 x i64> [[BASE:%.*]], i64 [[TMP1]])
+// CHECK-NEXT: ret <vscale x 2 x i64> [[TMP2]]
+//
+// CPP-CHECK-LABEL: @_Z36test_svld1q_gather_u64base_index_s64u10__SVBool_tu12__SVUint64_tl(
+// CPP-CHECK-NEXT: entry:
+// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 2 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv2i1(<vscale x 16 x i1> [[PG:%.*]])
+// CPP-CHECK-NEXT: [[TMP1:%.*]] = shl i64 [[IDX:%.*]], 3
+// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call <vscale x 2 x i64> @llvm.aarch64.sve.ld1q.gather.scalar.offset.nxv2i64.nxv2i64(<vscale x 2 x i1> [[TMP0]], <vscale x 2 x i64> [[BASE:%.*]], i64 [[TMP1]])
+// CPP-CHECK-NEXT: ret <vscale x 2 x i64> [[TMP2]]
+//
+svint64_t test_svld1q_gather_u64base_index_s64(svbool_t pg, svuint64_t base, int64_t idx) {
+ return SVE_ACLE_FUNC(svld1q_gather,_u64base,_index_s64,)(pg, base, idx);
+}
+
+// CHECK-LABEL: @test_svld1q_gather_u64base_index_u64(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 2 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv2i1(<vscale x 16 x i1> [[PG:%.*]])
+// CHECK-NEXT: [[TMP1:%.*]] = shl i64 [[IDX:%.*]], 3
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <vscale x 2 x i64> @llvm.aarch64.sve.ld1q.gather.scalar.offset.nxv2i64.nxv2i64(<vscale x 2 x i1> [[TMP0]], <vscale x 2 x i64> [[BASE:%.*]], i64 [[TMP1]])
+// CHECK-NEXT: ret <vscale x 2 x i64> [[TMP2]]
+//
+// CPP-CHECK-LABEL: @_Z36test_svld1q_gather_u64base_index_u64u10__SVBool_tu12__SVUint64_tl(
+// CPP-CHECK-NEXT: entry:
+// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 2 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv2i1(<vscale x 16 x i1> [[PG:%.*]])
+// CPP-CHECK-NEXT: [[TMP1:%.*]] = shl i64 [[IDX:%.*]], 3
+// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call <vscale x 2 x i64> @llvm.aarch64.sve.ld1q.gather.scalar.offset.nxv2i64.nxv2i64(<vscale x 2 x i1> [[TMP0]], <vscale x 2 x i64> [[BASE:%.*]], i64 [[TMP1]])
+// CPP-CHECK-NEXT: ret <vscale x 2 x i64> [[TMP2]]
+//
+svuint64_t test_svld1q_gather_u64base_index_u64(svbool_t pg, svuint64_t base, int64_t idx) {
+ return SVE_ACLE_FUNC(svld1q_gather,_u64base,_index_u64,)(pg, base, idx);
+}
+
+// CHECK-LABEL: @test_svld1q_gather_u64base_index_bf16(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 8 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv8i1(<vscale x 16 x i1> [[PG:%.*]])
+// CHECK-NEXT: [[TMP1:%.*]] = shl i64 [[IDX:%.*]], 1
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <vscale x 8 x bfloat> @llvm.aarch64.sve.ld1q.gather.scalar.offset.nxv8bf16.nxv2i64(<vscale x 8 x i1> [[TMP0]], <vscale x 2 x i64> [[BASE:%.*]], i64 [[TMP1]])
+// CHECK-NEXT: ret <vscale x 8 x bfloat> [[TMP2]]
+//
+// CPP-CHECK-LABEL: @_Z37test_svld1q_gather_u64base_index_bf16u10__SVBool_tu12__SVUint64_tl(
+// CPP-CHECK-NEXT: entry:
+// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 8 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv8i1(<vscale x 16 x i1> [[PG:%.*]])
+// CPP-CHECK-NEXT: [[TMP1:%.*]] = shl i64 [[IDX:%.*]], 1
+// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call <vscale x 8 x bfloat> @llvm.aarch64.sve.ld1q.gather.scalar.offset.nxv8bf16.nxv2i64(<vscale x 8 x i1> [[TMP0]], <vscale x 2 x i64> [[BASE:%.*]], i64 [[TMP1]])
+// CPP-CHECK-NEXT: ret <vscale x 8 x bfloat> [[TMP2]]
+//
+svbfloat16_t test_svld1q_gather_u64base_index_bf16(svbool_t pg, svuint64_t base, int64_t idx) {
+ return SVE_ACLE_FUNC(svld1q_gather,_u64base,_index_bf16,)(pg, base, idx);
+}
+
+// CHECK-LABEL: @test_svld1q_gather_u64base_index_f16(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 8 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv8i1(<vscale x 16 x i1> [[PG:%.*]])
+// CHECK-NEXT: [[TMP1:%.*]] = shl i64 [[IDX:%.*]], 1
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <vscale x 8 x half> @llvm.aarch64.sve.ld1q.gather.scalar.offset.nxv8f16.nxv2i64(<vscale x 8 x i1> [[TMP0]], <vscale x 2 x i64> [[BASE:%.*]], i64 [[TMP1]])
+// CHECK-NEXT: ret <vscale x 8 x half> [[TMP2]]
+//
+// CPP-CHECK-LABEL: @_Z36test_svld1q_gather_u64base_index_f16u10__SVBool_tu12__SVUint64_tl(
+// CPP-CHECK-NEXT: entry:
+// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 8 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv8i1(<vscale x 16 x i1> [[PG:%.*]])
+// CPP-CHECK-NEXT: [[TMP1:%.*]] = shl i64 [[IDX:%.*]], 1
+// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call <vscale x 8 x half> @llvm.aarch64.sve.ld1q.gather.scalar.offset.nxv8f16.nxv2i64(<vscale x 8 x i1> [[TMP0]], <vscale x 2 x i64> [[BASE:%.*]], i64 [[TMP1]])
+// CPP-CHECK-NEXT: ret <vscale x 8 x half> [[TMP2]]
+//
+svfloat16_t test_svld1q_gather_u64base_index_f16(svbool_t pg, svuint64_t base, int64_t idx) {
+ return SVE_ACLE_FUNC(svld1q_gather,_u64base,_index_f16,)(pg, base, idx);
+}
+
+// CHECK-LABEL: @test_svld1q_gather_u64base_index_f32(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 4 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv4i1(<vscale x 16 x i1> [[PG:%.*]])
+// CHECK-NEXT: [[TMP1:%.*]] = shl i64 [[IDX:%.*]], 2
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <vscale x 4 x float> @llvm.aarch64.sve.ld1q.gather.scalar.offset.nxv4f32.nxv2i64(<vscale x 4 x i1> [[TMP0]], <vscale x 2 x i64> [[BASE:%.*]], i64 [[TMP1]])
+// CHECK-NEXT: ret <vscale x 4 x float> [[TMP2]]
+//
+// CPP-CHECK-LABEL: @_Z36test_svld1q_gather_u64base_index_f32u10__SVBool_tu12__SVUint64_tl(
+// CPP-CHECK-NEXT: entry:
+// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 4 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv4i1(<vscale x 16 x i1> [[PG:%.*]])
+// CPP-CHECK-NEXT: [[TMP1:%.*]] = shl i64 [[IDX:%.*]], 2
+// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call <vscale x 4 x float> @llvm.aarch64.sve.ld1q.gather.scalar.offset.nxv4f32.nxv2i64(<vscale x 4 x i1> [[TMP0]], <vscale x 2 x i64> [[BASE:%.*]], i64 [[TMP1]])
+// CPP-CHECK-NEXT: ret <vscale x 4 x float> [[TMP2]]
+//
+svfloat32_t test_svld1q_gather_u64base_index_f32(svbool_t pg, svuint64_t base, int64_t idx) {
+ return SVE_ACLE_FUNC(svld1q_gather,_u64base,_index_f32,)(pg, base, idx);
+}
+
+// CHECK-LABEL: @test_svld1q_gather_u64base_index_f64(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 2 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv2i1(<vscale x 16 x i1> [[PG:%.*]])
+// CHECK-NEXT: [[TMP1:%.*]] = shl i64 [[IDX:%.*]], 3
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <vscale x 2 x double> @llvm.aarch64.sve.ld1q.gather.scalar.offset.nxv2f64.nxv2i64(<vscale x 2 x i1> [[TMP0]], <vscale x 2 x i64> [[BASE:%.*]], i64 [[TMP1]])
+// CHECK-NEXT: ret <vscale x 2 x double> [[TMP2]]
+//
+// CPP-CHECK-LABEL: @_Z36test_svld1q_gather_u64base_index_f64u10__SVBool_tu12__SVUint64_tl(
+// CPP-CHECK-NEXT: entry:
+// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 2 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv2i1(<vscale x 16 x i1> [[PG:%.*]])
+// CPP-CHECK-NEXT: [[TMP1:%.*]] = shl i64 [[IDX:%.*]], 3
+// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call <vscale x 2 x double> @llvm.aarch64.sve.ld1q.gather.scalar.offset.nxv2f64.nxv2i64(<vscale x 2 x i1> [[TMP0]], <vscale x 2 x i64> [[BASE:%.*]], i64 [[TMP1]])
+// CPP-CHECK-NEXT: ret <vscale x 2 x double> [[TMP2]]
+//
+svfloat64_t test_svld1q_gather_u64base_index_f64(svbool_t pg, svuint64_t base, int64_t idx) {
+ return SVE_ACLE_FUNC(svld1q_gather,_u64base,_index_f64,)(pg, base, idx);
+}
diff --git a/clang/test/CodeGen/aarch64-sve2p1-intrinsics/acle_sve2p1_store.c b/clang/test/CodeGen/aarch64-sve2p1-intrinsics/acle_sve2p1_store.c
index 5584fb16379ed9c..24bf17e7e69bcea 100644
--- a/clang/test/CodeGen/aarch64-sve2p1-intrinsics/acle_sve2p1_store.c
+++ b/clang/test/CodeGen/aarch64-sve2p1-intrinsics/acle_sve2p1_store.c
@@ -2122,3 +2122,259 @@ void test_svst1q_scatter_u64base_bf16(svbool_t pg, svuint64_t base, svbfloat16_t
{
SVE_ACLE_FUNC(svst1q_scatter, _u64base,,_bf16)(pg, base, data);
}
+
+// CHECK-LABEL: @test_svst1q_scatter_u64index_s64(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 2 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv2i1(<vscale x 16 x i1> [[PG:%.*]])
+// CHECK-NEXT: tail call void @llvm.aarch64.sve.st1q.scatter.index.nxv2i64(<vscale x 2 x i64> [[DATA:%.*]], <vscale x 2 x i1> [[TMP0]], ptr [[BASE:%.*]], <vscale x 2 x i64> [[IDX:%.*]])
+// CHECK-NEXT: ret void
+//
+// CPP-CHECK-LABEL: @_Z32test_svst1q_scatter_u64index_s64u10__SVBool_tPlu12__SVUint64_tu11__SVInt64_t(
+// CPP-CHECK-NEXT: entry:
+// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 2 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv2i1(<vscale x 16 x i1> [[PG:%.*]])
+// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sve.st1q.scatter.index.nxv2i64(<vscale x 2 x i64> [[DATA:%.*]], <vscale x 2 x i1> [[TMP0]], ptr [[BASE:%.*]], <vscale x 2 x i64> [[IDX:%.*]])
+// CPP-CHECK-NEXT: ret void
+//
+void test_svst1q_scatter_u64index_s64(svbool_t pg, int64_t *base, svuint64_t idx, svint64_t data) {
+ SVE_ACLE_FUNC(svst1q_scatter_, u64, index, _s64)(pg, base, idx, data);
+}
+
+// CHECK-LABEL: @test_svst1q_scatter_u64index_u64(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 2 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv2i1(<vscale x 16 x i1> [[PG:%.*]])
+// CHECK-NEXT: tail call void @llvm.aarch64.sve.st1q.scatter.index.nxv2i64(<vscale x 2 x i64> [[DATA:%.*]], <vscale x 2 x i1> [[TMP0]], ptr [[BASE:%.*]], <vscale x 2 x i64> [[IDX:%.*]])
+// CHECK-NEXT: ret void
+//
+// CPP-CHECK-LABEL: @_Z32test_svst1q_scatter_u64index_u64u10__SVBool_tPmu12__SVUint64_tS1_(
+// CPP-CHECK-NEXT: entry:
+// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 2 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv2i1(<vscale x 16 x i1> [[PG:%.*]])
+// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sve.st1q.scatter.index.nxv2i64(<vscale x 2 x i64> [[DATA:%.*]], <vscale x 2 x i1> [[TMP0]], ptr [[BASE:%.*]], <vscale x 2 x i64> [[IDX:%.*]])
+// CPP-CHECK-NEXT: ret void
+//
+void test_svst1q_scatter_u64index_u64(svbool_t pg, uint64_t *base, svuint64_t idx, svuint64_t data) {
+ SVE_ACLE_FUNC(svst1q_scatter_, u64, index, _u64)(pg, base, idx, data);
+}
+
+// CHECK-LABEL: @test_svst1q_scatter_u64index_f64(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 2 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv2i1(<vscale x 16 x i1> [[PG:%.*]])
+// CHECK-NEXT: tail call void @llvm.aarch64.sve.st1q.scatter.index.nxv2f64(<vscale x 2 x double> [[DATA:%.*]], <vscale x 2 x i1> [[TMP0]], ptr [[BASE:%.*]], <vscale x 2 x i64> [[IDX:%.*]])
+// CHECK-NEXT: ret void
+//
+// CPP-CHECK-LABEL: @_Z32test_svst1q_scatter_u64index_f64u10__SVBool_tPdu12__SVUint64_tu13__SVFloat64_t(
+// CPP-CHECK-NEXT: entry:
+// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 2 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv2i1(<vscale x 16 x i1> [[PG:%.*]])
+// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sve.st1q.scatter.index.nxv2f64(<vscale x 2 x double> [[DATA:%.*]], <vscale x 2 x i1> [[TMP0]], ptr [[BASE:%.*]], <vscale x 2 x i64> [[IDX:%.*]])
+// CPP-CHECK-NEXT: ret void
+//
+void test_svst1q_scatter_u64index_f64(svbool_t pg, float64_t *base, svuint64_t idx, svfloat64_t data) {
+ SVE_ACLE_FUNC(svst1q_scatter_, u64, index, _f64)(pg, base, idx, data);
+}
+
+// CHECK-LABEL: @test_svst1q_scatter_u64base_index_s8(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: tail call void @llvm.aarch64.sve.st1q.scatter.scalar.offset.nxv16i8.nxv2i64(<vscale x 16 x i8> [[DATA:%.*]], <vscale x 16 x i1> [[PG:%.*]], <vscale x 2 x i64> [[BASE:%.*]], i64 [[IDX:%.*]])
+// CHECK-NEXT: ret void
+//
+// CPP-CHECK-LABEL: @_Z36test_svst1q_scatter_u64base_index_s8u10__SVBool_tu12__SVUint64_tlu10__SVInt8_t(
+// CPP-CHECK-NEXT: entry:
+// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sve.st1q.scatter.scalar.offset.nxv16i8.nxv2i64(<vscale x 16 x i8> [[DATA:%.*]], <vscale x 16 x i1> [[PG:%.*]], <vscale x 2 x i64> [[BASE:%.*]], i64 [[IDX:%.*]])
+// CPP-CHECK-NEXT: ret void
+//
+void test_svst1q_scatter_u64base_index_s8(svbool_t pg, svuint64_t base, int64_t idx, svint8_t data) {
+ SVE_ACLE_FUNC(svst1q_scatter,_u64base,_index,_s8)(pg, base, idx, data);
+}
+
+// CHECK-LABEL: @test_svst1q_scatter_u64base_index_u8(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: tail call void @llvm.aarch64.sve.st1q.scatter.scalar.offset.nxv16i8.nxv2i64(<vscale x 16 x i8> [[DATA:%.*]], <vscale x 16 x i1> [[PG:%.*]], <vscale x 2 x i64> [[BASE:%.*]], i64 [[IDX:%.*]])
+// CHECK-NEXT: ret void
+//
+// CPP-CHECK-LABEL: @_Z36test_svst1q_scatter_u64base_index_u8u10__SVBool_tu12__SVUint64_tlu11__SVUint8_t(
+// CPP-CHECK-NEXT: entry:
+// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sve.st1q.scatter.scalar.offset.nxv16i8.nxv2i64(<vscale x 16 x i8> [[DATA:%.*]], <vscale x 16 x i1> [[PG:%.*]], <vscale x 2 x i64> [[BASE:%.*]], i64 [[IDX:%.*]])
+// CPP-CHECK-NEXT: ret void
+//
+void test_svst1q_scatter_u64base_index_u8(svbool_t pg, svuint64_t base, int64_t idx, svuint8_t data) {
+ SVE_ACLE_FUNC(svst1q_scatter,_u64base,_index,_u8)(pg, base, idx, data);
+}
+
+// CHECK-LABEL: @test_svst1q_scatter_u64base_index_s16(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 8 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv8i1(<vscale x 16 x i1> [[PG:%.*]])
+// CHECK-NEXT: [[TMP1:%.*]] = shl i64 [[IDX:%.*]], 1
+// CHECK-NEXT: tail call void @llvm.aarch64.sve.st1q.scatter.scalar.offset.nxv8i16.nxv2i64(<vscale x 8 x i16> [[DATA:%.*]], <vscale x 8 x i1> [[TMP0]], <vscale x 2 x i64> [[BASE:%.*]], i64 [[TMP1]])
+// CHECK-NEXT: ret void
+//
+// CPP-CHECK-LABEL: @_Z37test_svst1q_scatter_u64base_index_s16u10__SVBool_tu12__SVUint64_tlu11__SVInt16_t(
+// CPP-CHECK-NEXT: entry:
+// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 8 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv8i1(<vscale x 16 x i1> [[PG:%.*]])
+// CPP-CHECK-NEXT: [[TMP1:%.*]] = shl i64 [[IDX:%.*]], 1
+// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sve.st1q.scatter.scalar.offset.nxv8i16.nxv2i64(<vscale x 8 x i16> [[DATA:%.*]], <vscale x 8 x i1> [[TMP0]], <vscale x 2 x i64> [[BASE:%.*]], i64 [[TMP1]])
+// CPP-CHECK-NEXT: ret void
+//
+void test_svst1q_scatter_u64base_index_s16(svbool_t pg, svuint64_t base, int64_t idx, svint16_t data) {
+ SVE_ACLE_FUNC(svst1q_scatter,_u64base,_index,_s16)(pg, base, idx, data);
+}
+
+// CHECK-LABEL: @test_svst1q_scatter_u64base_index_u16(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 8 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv8i1(<vscale x 16 x i1> [[PG:%.*]])
+// CHECK-NEXT: [[TMP1:%.*]] = shl i64 [[IDX:%.*]], 1
+// CHECK-NEXT: tail call void @llvm.aarch64.sve.st1q.scatter.scalar.offset.nxv8i16.nxv2i64(<vscale x 8 x i16> [[DATA:%.*]], <vscale x 8 x i1> [[TMP0]], <vscale x 2 x i64> [[BASE:%.*]], i64 [[TMP1]])
+// CHECK-NEXT: ret void
+//
+// CPP-CHECK-LABEL: @_Z37test_svst1q_scatter_u64base_index_u16u10__SVBool_tu12__SVUint64_tlu12__SVUint16_t(
+// CPP-CHECK-NEXT: entry:
+// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 8 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv8i1(<vscale x 16 x i1> [[PG:%.*]])
+// CPP-CHECK-NEXT: [[TMP1:%.*]] = shl i64 [[IDX:%.*]], 1
+// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sve.st1q.scatter.scalar.offset.nxv8i16.nxv2i64(<vscale x 8 x i16> [[DATA:%.*]], <vscale x 8 x i1> [[TMP0]], <vscale x 2 x i64> [[BASE:%.*]], i64 [[TMP1]])
+// CPP-CHECK-NEXT: ret void
+//
+void test_svst1q_scatter_u64base_index_u16(svbool_t pg, svuint64_t base, int64_t idx, svuint16_t data) {
+ SVE_ACLE_FUNC(svst1q_scatter,_u64base,_index,_u16)(pg, base, idx, data);
+}
+
+// CHECK-LABEL: @test_svst1q_scatter_u64base_index_s32(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 4 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv4i1(<vscale x 16 x i1> [[PG:%.*]])
+// CHECK-NEXT: [[TMP1:%.*]] = shl i64 [[IDX:%.*]], 2
+// CHECK-NEXT: tail call void @llvm.aarch64.sve.st1q.scatter.scalar.offset.nxv4i32.nxv2i64(<vscale x 4 x i32> [[DATA:%.*]], <vscale x 4 x i1> [[TMP0]], <vscale x 2 x i64> [[BASE:%.*]], i64 [[TMP1]])
+// CHECK-NEXT: ret void
+//
+// CPP-CHECK-LABEL: @_Z37test_svst1q_scatter_u64base_index_s32u10__SVBool_tu12__SVUint64_tlu11__SVInt32_t(
+// CPP-CHECK-NEXT: entry:
+// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 4 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv4i1(<vscale x 16 x i1> [[PG:%.*]])
+// CPP-CHECK-NEXT: [[TMP1:%.*]] = shl i64 [[IDX:%.*]], 2
+// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sve.st1q.scatter.scalar.offset.nxv4i32.nxv2i64(<vscale x 4 x i32> [[DATA:%.*]], <vscale x 4 x i1> [[TMP0]], <vscale x 2 x i64> [[BASE:%.*]], i64 [[TMP1]])
+// CPP-CHECK-NEXT: ret void
+//
+void test_svst1q_scatter_u64base_index_s32(svbool_t pg, svuint64_t base, int64_t idx, svint32_t data) {
+ SVE_ACLE_FUNC(svst1q_scatter,_u64base,_index,_s32)(pg, base, idx, data);
+}
+
+// CHECK-LABEL: @test_svst1q_scatter_u64base_index_u32(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 4 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv4i1(<vscale x 16 x i1> [[PG:%.*]])
+// CHECK-NEXT: [[TMP1:%.*]] = shl i64 [[IDX:%.*]], 2
+// CHECK-NEXT: tail call void @llvm.aarch64.sve.st1q.scatter.scalar.offset.nxv4i32.nxv2i64(<vscale x 4 x i32> [[DATA:%.*]], <vscale x 4 x i1> [[TMP0]], <vscale x 2 x i64> [[BASE:%.*]], i64 [[TMP1]])
+// CHECK-NEXT: ret void
+//
+// CPP-CHECK-LABEL: @_Z37test_svst1q_scatter_u64base_index_u32u10__SVBool_tu12__SVUint64_tlu12__SVUint32_t(
+// CPP-CHECK-NEXT: entry:
+// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 4 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv4i1(<vscale x 16 x i1> [[PG:%.*]])
+// CPP-CHECK-NEXT: [[TMP1:%.*]] = shl i64 [[IDX:%.*]], 2
+// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sve.st1q.scatter.scalar.offset.nxv4i32.nxv2i64(<vscale x 4 x i32> [[DATA:%.*]], <vscale x 4 x i1> [[TMP0]], <vscale x 2 x i64> [[BASE:%.*]], i64 [[TMP1]])
+// CPP-CHECK-NEXT: ret void
+//
+void test_svst1q_scatter_u64base_index_u32(svbool_t pg, svuint64_t base, int64_t idx, svuint32_t data) {
+ SVE_ACLE_FUNC(svst1q_scatter,_u64base,_index,_u32)(pg, base, idx, data);
+}
+
+// CHECK-LABEL: @test_svst1q_scatter_u64base_index_s64(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 2 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv2i1(<vscale x 16 x i1> [[PG:%.*]])
+// CHECK-NEXT: [[TMP1:%.*]] = shl i64 [[IDX:%.*]], 3
+// CHECK-NEXT: tail call void @llvm.aarch64.sve.st1q.scatter.scalar.offset.nxv2i64.nxv2i64(<vscale x 2 x i64> [[DATA:%.*]], <vscale x 2 x i1> [[TMP0]], <vscale x 2 x i64> [[BASE:%.*]], i64 [[TMP1]])
+// CHECK-NEXT: ret void
+//
+// CPP-CHECK-LABEL: @_Z37test_svst1q_scatter_u64base_index_s64u10__SVBool_tu12__SVUint64_tlu11__SVInt64_t(
+// CPP-CHECK-NEXT: entry:
+// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 2 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv2i1(<vscale x 16 x i1> [[PG:%.*]])
+// CPP-CHECK-NEXT: [[TMP1:%.*]] = shl i64 [[IDX:%.*]], 3
+// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sve.st1q.scatter.scalar.offset.nxv2i64.nxv2i64(<vscale x 2 x i64> [[DATA:%.*]], <vscale x 2 x i1> [[TMP0]], <vscale x 2 x i64> [[BASE:%.*]], i64 [[TMP1]])
+// CPP-CHECK-NEXT: ret void
+//
+void test_svst1q_scatter_u64base_index_s64(svbool_t pg, svuint64_t base, int64_t idx, svint64_t data) {
+ SVE_ACLE_FUNC(svst1q_scatter,_u64base,_index,_s64)(pg, base, idx, data);
+}
+
+// CHECK-LABEL: @test_svst1q_scatter_u64base_index_u64(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 2 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv2i1(<vscale x 16 x i1> [[PG:%.*]])
+// CHECK-NEXT: [[TMP1:%.*]] = shl i64 [[IDX:%.*]], 3
+// CHECK-NEXT: tail call void @llvm.aarch64.sve.st1q.scatter.scalar.offset.nxv2i64.nxv2i64(<vscale x 2 x i64> [[DATA:%.*]], <vscale x 2 x i1> [[TMP0]], <vscale x 2 x i64> [[BASE:%.*]], i64 [[TMP1]])
+// CHECK-NEXT: ret void
+//
+// CPP-CHECK-LABEL: @_Z37test_svst1q_scatter_u64base_index_u64u10__SVBool_tu12__SVUint64_tlS0_(
+// CPP-CHECK-NEXT: entry:
+// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 2 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv2i1(<vscale x 16 x i1> [[PG:%.*]])
+// CPP-CHECK-NEXT: [[TMP1:%.*]] = shl i64 [[IDX:%.*]], 3
+// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sve.st1q.scatter.scalar.offset.nxv2i64.nxv2i64(<vscale x 2 x i64> [[DATA:%.*]], <vscale x 2 x i1> [[TMP0]], <vscale x 2 x i64> [[BASE:%.*]], i64 [[TMP1]])
+// CPP-CHECK-NEXT: ret void
+//
+void test_svst1q_scatter_u64base_index_u64(svbool_t pg, svuint64_t base, int64_t idx, svuint64_t data) {
+ SVE_ACLE_FUNC(svst1q_scatter,_u64base,_index,_u64)(pg, base, idx, data);
+}
+
+// CHECK-LABEL: @test_svst1q_scatter_u64base_index_bf16(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 8 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv8i1(<vscale x 16 x i1> [[PG:%.*]])
+// CHECK-NEXT: [[TMP1:%.*]] = shl i64 [[IDX:%.*]], 1
+// CHECK-NEXT: tail call void @llvm.aarch64.sve.st1q.scatter.scalar.offset.nxv8bf16.nxv2i64(<vscale x 8 x bfloat> [[DATA:%.*]], <vscale x 8 x i1> [[TMP0]], <vscale x 2 x i64> [[BASE:%.*]], i64 [[TMP1]])
+// CHECK-NEXT: ret void
+//
+// CPP-CHECK-LABEL: @_Z38test_svst1q_scatter_u64base_index_bf16u10__SVBool_tu12__SVUint64_tlu14__SVBfloat16_t(
+// CPP-CHECK-NEXT: entry:
+// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 8 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv8i1(<vscale x 16 x i1> [[PG:%.*]])
+// CPP-CHECK-NEXT: [[TMP1:%.*]] = shl i64 [[IDX:%.*]], 1
+// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sve.st1q.scatter.scalar.offset.nxv8bf16.nxv2i64(<vscale x 8 x bfloat> [[DATA:%.*]], <vscale x 8 x i1> [[TMP0]], <vscale x 2 x i64> [[BASE:%.*]], i64 [[TMP1]])
+// CPP-CHECK-NEXT: ret void
+//
+void test_svst1q_scatter_u64base_index_bf16(svbool_t pg, svuint64_t base, int64_t idx, svbfloat16_t data) {
+ SVE_ACLE_FUNC(svst1q_scatter,_u64base,_index,_bf16)(pg, base, idx, data);
+}
+
+// CHECK-LABEL: @test_svst1q_scatter_u64base_index_f16(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 8 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv8i1(<vscale x 16 x i1> [[PG:%.*]])
+// CHECK-NEXT: [[TMP1:%.*]] = shl i64 [[IDX:%.*]], 1
+// CHECK-NEXT: tail call void @llvm.aarch64.sve.st1q.scatter.scalar.offset.nxv8f16.nxv2i64(<vscale x 8 x half> [[DATA:%.*]], <vscale x 8 x i1> [[TMP0]], <vscale x 2 x i64> [[BASE:%.*]], i64 [[TMP1]])
+// CHECK-NEXT: ret void
+//
+// CPP-CHECK-LABEL: @_Z37test_svst1q_scatter_u64base_index_f16u10__SVBool_tu12__SVUint64_tlu13__SVFloat16_t(
+// CPP-CHECK-NEXT: entry:
+// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 8 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv8i1(<vscale x 16 x i1> [[PG:%.*]])
+// CPP-CHECK-NEXT: [[TMP1:%.*]] = shl i64 [[IDX:%.*]], 1
+// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sve.st1q.scatter.scalar.offset.nxv8f16.nxv2i64(<vscale x 8 x half> [[DATA:%.*]], <vscale x 8 x i1> [[TMP0]], <vscale x 2 x i64> [[BASE:%.*]], i64 [[TMP1]])
+// CPP-CHECK-NEXT: ret void
+//
+void test_svst1q_scatter_u64base_index_f16(svbool_t pg, svuint64_t base, int64_t idx, svfloat16_t data) {
+ SVE_ACLE_FUNC(svst1q_scatter,_u64base,_index,_f16)(pg, base, idx, data);
+}
+
+// CHECK-LABEL: @test_svst1q_scatter_u64base_index_f32(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 4 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv4i1(<vscale x 16 x i1> [[PG:%.*]])
+// CHECK-NEXT: [[TMP1:%.*]] = shl i64 [[IDX:%.*]], 2
+// CHECK-NEXT: tail call void @llvm.aarch64.sve.st1q.scatter.scalar.offset.nxv4f32.nxv2i64(<vscale x 4 x float> [[DATA:%.*]], <vscale x 4 x i1> [[TMP0]], <vscale x 2 x i64> [[BASE:%.*]], i64 [[TMP1]])
+// CHECK-NEXT: ret void
+//
+// CPP-CHECK-LABEL: @_Z37test_svst1q_scatter_u64base_index_f32u10__SVBool_tu12__SVUint64_tlu13__SVFloat32_t(
+// CPP-CHECK-NEXT: entry:
+// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 4 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv4i1(<vscale x 16 x i1> [[PG:%.*]])
+// CPP-CHECK-NEXT: [[TMP1:%.*]] = shl i64 [[IDX:%.*]], 2
+// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sve.st1q.scatter.scalar.offset.nxv4f32.nxv2i64(<vscale x 4 x float> [[DATA:%.*]], <vscale x 4 x i1> [[TMP0]], <vscale x 2 x i64> [[BASE:%.*]], i64 [[TMP1]])
+// CPP-CHECK-NEXT: ret void
+//
+void test_svst1q_scatter_u64base_index_f32(svbool_t pg, svuint64_t base, int64_t idx, svfloat32_t data) {
+ SVE_ACLE_FUNC(svst1q_scatter,_u64base,_index,_f32)(pg, base, idx, data);
+}
+
+// CHECK-LABEL: @test_svst1q_scatter_u64base_index_f64(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 2 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv2i1(<vscale x 16 x i1> [[PG:%.*]])
+// CHECK-NEXT: [[TMP1:%.*]] = shl i64 [[IDX:%.*]], 3
+// CHECK-NEXT: tail call void @llvm.aarch64.sve.st1q.scatter.scalar.offset.nxv2f64.nxv2i64(<vscale x 2 x double> [[DATA:%.*]], <vscale x 2 x i1> [[TMP0]], <vscale x 2 x i64> [[BASE:%.*]], i64 [[TMP1]])
+// CHECK-NEXT: ret void
+//
+// CPP-CHECK-LABEL: @_Z37test_svst1q_scatter_u64base_index_f64u10__SVBool_tu12__SVUint64_tlu13__SVFloat64_t(
+// CPP-CHECK-NEXT: entry:
+// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 2 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv2i1(<vscale x 16 x i1> [[PG:%.*]])
+// CPP-CHECK-NEXT: [[TMP1:%.*]] = shl i64 [[IDX:%.*]], 3
+// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sve.st1q.scatter.scalar.offset.nxv2f64.nxv2i64(<vscale x 2 x double> [[DATA:%.*]], <vscale x 2 x i1> [[TMP0]], <vscale x 2 x i64> [[BASE:%.*]], i64 [[TMP1]])
+// CPP-CHECK-NEXT: ret void
+//
+void test_svst1q_scatter_u64base_index_f64(svbool_t pg, svuint64_t base, int64_t idx, svfloat64_t data) {
+ SVE_ACLE_FUNC(svst1q_scatter,_u64base,_index,_f64)(pg, base, idx, data);
+}
diff --git a/llvm/include/llvm/IR/IntrinsicsAArch64.td b/llvm/include/llvm/IR/IntrinsicsAArch64.td
index ee2bcb0a9540d6b..71bbe4354a6ba37 100644
--- a/llvm/include/llvm/IR/IntrinsicsAArch64.td
+++ b/llvm/include/llvm/IR/IntrinsicsAArch64.td
@@ -2135,6 +2135,9 @@ def int_aarch64_sve_ld1_gather_scalar_offset : AdvSIMD_GatherLoad_VS_Intrinsic;
// 128-bit loads, unscaled offsets
def int_aarch64_sve_ld1q_gather_scalar_offset : AdvSIMD_GatherLoad_VS_Intrinsic;
+// 128-bit loads, scaled offsets (indices)
+def int_aarch64_sve_ld1q_gather_index : AdvSIMD_GatherLoad_SV_64b_Offsets_Intrinsic;
+
//
// First-faulting gather loads: scalar base + vector offsets
//
@@ -2213,6 +2216,9 @@ def int_aarch64_sve_st1_scatter_scalar_offset : AdvSIMD_ScatterStore_VS_Intrinsi
// 128-bit stores, unscaled offsets
def int_aarch64_sve_st1q_scatter_scalar_offset : AdvSIMD_ScatterStore_VS_Intrinsic;
+// 128-bit stores, scaled offsets (indices)
+def int_aarch64_sve_st1q_scatter_index : AdvSIMD_ScatterStore_SV_64b_Offsets_Intrinsic;
+
//
// Non-temporal scatter stores: scalar base + vector offsets
//
diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
index c96eb86ab4dcc83..d72c9b7ed45f397 100644
--- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
+++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
@@ -2573,6 +2573,7 @@ const char *AArch64TargetLowering::getTargetNodeName(unsigned Opcode) const {
MAKE_CASE(AArch64ISD::GLD1_UXTW_SCALED_MERGE_ZERO)
MAKE_CASE(AArch64ISD::GLD1_IMM_MERGE_ZERO)
MAKE_CASE(AArch64ISD::GLD1Q_MERGE_ZERO)
+ MAKE_CASE(AArch64ISD::GLD1Q_INDEX_MERGE_ZERO)
MAKE_CASE(AArch64ISD::GLD1S_MERGE_ZERO)
MAKE_CASE(AArch64ISD::GLD1S_SCALED_MERGE_ZERO)
MAKE_CASE(AArch64ISD::GLD1S_SXTW_MERGE_ZERO)
@@ -2598,6 +2599,7 @@ const char *AArch64TargetLowering::getTargetNodeName(unsigned Opcode) const {
MAKE_CASE(AArch64ISD::GLDNT1_INDEX_MERGE_ZERO)
MAKE_CASE(AArch64ISD::GLDNT1S_MERGE_ZERO)
MAKE_CASE(AArch64ISD::SST1Q_PRED)
+ MAKE_CASE(AArch64ISD::SST1Q_INDEX_PRED)
MAKE_CASE(AArch64ISD::ST1_PRED)
MAKE_CASE(AArch64ISD::SST1_PRED)
MAKE_CASE(AArch64ISD::SST1_SCALED_PRED)
@@ -22611,10 +22613,11 @@ static SDValue performScatterStoreCombine(SDNode *N, SelectionDAG &DAG,
return SDValue();
// For FPs, ACLE only supports _packed_ single and double precision types.
- // SST1Q_PRED is the ST1Q for sve2p1 and should allow all sizes
+ // SST1Q_[INDEX_]PRED is the ST1Q for sve2p1 and should allow all sizes.
if (SrcElVT.isFloatingPoint())
if ((SrcVT != MVT::nxv4f32) && (SrcVT != MVT::nxv2f64) &&
- (Opcode != AArch64ISD::SST1Q_PRED ||
+ ((Opcode != AArch64ISD::SST1Q_PRED &&
+ Opcode != AArch64ISD::SST1Q_INDEX_PRED) ||
((SrcVT != MVT::nxv8f16) && (SrcVT != MVT::nxv8bf16))))
return SDValue();
@@ -22632,14 +22635,19 @@ static SDValue performScatterStoreCombine(SDNode *N, SelectionDAG &DAG,
Offset =
getScaledOffsetForBitWidth(DAG, Offset, DL, SrcElVT.getSizeInBits());
Opcode = AArch64ISD::SSTNT1_PRED;
+ } else if (Opcode == AArch64ISD::SST1Q_INDEX_PRED) {
+ Offset =
+ getScaledOffsetForBitWidth(DAG, Offset, DL, SrcElVT.getSizeInBits());
+ Opcode = AArch64ISD::SST1Q_PRED;
}
-
+
// In the case of non-temporal gather loads there's only one SVE instruction
// per data-size: "scalar + vector", i.e.
// * stnt1{b|h|w|d} { z0.s }, p0/z, [z0.s, x0]
// Since we do have intrinsics that allow the arguments to be in a different
// order, we may need to swap them to match the spec.
- if (Opcode == AArch64ISD::SSTNT1_PRED && Offset.getValueType().isVector())
+ if ((Opcode == AArch64ISD::SSTNT1_PRED || Opcode == AArch64ISD::SST1Q_PRED) &&
+ Offset.getValueType().isVector())
std::swap(Base, Offset);
// SST1_IMM requires that the offset is an immediate that is:
@@ -22722,21 +22730,26 @@ static SDValue performGatherLoadCombine(SDNode *N, SelectionDAG &DAG,
// vector of offsets (that fits into one register)
SDValue Offset = N->getOperand(4);
- // For "scalar + vector of indices", just scale the indices. This only
- // applies to non-temporal gathers because there's no instruction that takes
- // indicies.
+ // For "scalar + vector of indices", scale the indices to obtain unscaled
+ // offsets. This applies to non-temporal and quadword gathers, which do not
+ // have an addressing mode with scaled offset.
if (Opcode == AArch64ISD::GLDNT1_INDEX_MERGE_ZERO) {
Offset = getScaledOffsetForBitWidth(DAG, Offset, DL,
RetVT.getScalarSizeInBits());
Opcode = AArch64ISD::GLDNT1_MERGE_ZERO;
+ } else if (Opcode == AArch64ISD::GLD1Q_INDEX_MERGE_ZERO) {
+ Offset = getScaledOffsetForBitWidth(DAG, Offset, DL,
+ RetVT.getScalarSizeInBits());
+ Opcode = AArch64ISD::GLD1Q_MERGE_ZERO;
}
- // In the case of non-temporal gather loads there's only one SVE instruction
- // per data-size: "scalar + vector", i.e.
- // * ldnt1{b|h|w|d} { z0.s }, p0/z, [z0.s, x0]
+ // In the case of non-temporal gather loads and quadword gather loads there's
+ // only one addressing mode : "vector + scalar", e.g.
+ // ldnt1{b|h|w|d} { z0.s }, p0/z, [z0.s, x0]
// Since we do have intrinsics that allow the arguments to be in a different
// order, we may need to swap them to match the spec.
- if (Opcode == AArch64ISD::GLDNT1_MERGE_ZERO &&
+ if ((Opcode == AArch64ISD::GLDNT1_MERGE_ZERO ||
+ Opcode == AArch64ISD::GLD1Q_MERGE_ZERO) &&
Offset.getValueType().isVector())
std::swap(Base, Offset);
@@ -23586,6 +23599,8 @@ SDValue AArch64TargetLowering::PerformDAGCombine(SDNode *N,
return performGatherLoadCombine(N, DAG, AArch64ISD::GLD1_MERGE_ZERO);
case Intrinsic::aarch64_sve_ld1q_gather_scalar_offset:
return performGatherLoadCombine(N, DAG, AArch64ISD::GLD1Q_MERGE_ZERO);
+ case Intrinsic::aarch64_sve_ld1q_gather_index:
+ return performGatherLoadCombine(N, DAG, AArch64ISD::GLD1Q_INDEX_MERGE_ZERO);
case Intrinsic::aarch64_sve_ld1_gather_index:
return performGatherLoadCombine(N, DAG,
AArch64ISD::GLD1_SCALED_MERGE_ZERO);
@@ -23631,6 +23646,8 @@ SDValue AArch64TargetLowering::PerformDAGCombine(SDNode *N,
AArch64ISD::GLDFF1_IMM_MERGE_ZERO);
case Intrinsic::aarch64_sve_st1q_scatter_scalar_offset:
return performScatterStoreCombine(N, DAG, AArch64ISD::SST1Q_PRED);
+ case Intrinsic::aarch64_sve_st1q_scatter_index:
+ return performScatterStoreCombine(N, DAG, AArch64ISD::SST1Q_INDEX_PRED);
case Intrinsic::aarch64_sve_st1_scatter:
return performScatterStoreCombine(N, DAG, AArch64ISD::SST1_PRED);
case Intrinsic::aarch64_sve_st1_scatter_index:
diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.h b/llvm/lib/Target/AArch64/AArch64ISelLowering.h
index f05aaab86a23135..f5632e0c8d9e580 100644
--- a/llvm/lib/Target/AArch64/AArch64ISelLowering.h
+++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.h
@@ -374,6 +374,7 @@ enum NodeType : unsigned {
GLD1_SXTW_SCALED_MERGE_ZERO,
GLD1_IMM_MERGE_ZERO,
GLD1Q_MERGE_ZERO,
+ GLD1Q_INDEX_MERGE_ZERO,
// Signed gather loads
GLD1S_MERGE_ZERO,
@@ -419,6 +420,7 @@ enum NodeType : unsigned {
SST1_SXTW_SCALED_PRED,
SST1_IMM_PRED,
SST1Q_PRED,
+ SST1Q_INDEX_PRED,
// Non-temporal scatter store
SSTNT1_PRED,
diff --git a/llvm/test/CodeGen/AArch64/sve2p1-intrinsics-gather-loads-128bit-index.ll b/llvm/test/CodeGen/AArch64/sve2p1-intrinsics-gather-loads-128bit-index.ll
new file mode 100644
index 000000000000000..59b657a229befab
--- /dev/null
+++ b/llvm/test/CodeGen/AArch64/sve2p1-intrinsics-gather-loads-128bit-index.ll
@@ -0,0 +1,187 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 3
+; RUN: llc -mtriple=aarch64-linux-gnu -mattr=+sve2p1,+bf16 < %s | FileCheck %s
+
+declare <vscale x 2 x i64> @llvm.aarch64.sve.ld1q.gather.scalar.offset.nxv2i64.nxv2i64(<vscale x 2 x i1>, <vscale x 2 x i64>, i64)
+declare <vscale x 4 x i32> @llvm.aarch64.sve.ld1q.gather.scalar.offset.nxv4i32.nxv2i64(<vscale x 4 x i1>, <vscale x 2 x i64>, i64)
+declare <vscale x 8 x i16> @llvm.aarch64.sve.ld1q.gather.scalar.offset.nxv8i16.nxv2i64(<vscale x 8 x i1>, <vscale x 2 x i64>, i64)
+declare <vscale x 16 x i8> @llvm.aarch64.sve.ld1q.gather.scalar.offset.nxv16i8.nxv2i64(<vscale x 16 x i1>, <vscale x 2 x i64>, i64)
+declare <vscale x 2 x double> @llvm.aarch64.sve.ld1q.gather.scalar.offset.nxv2f64.nxv2i64(<vscale x 2 x i1>, <vscale x 2 x i64>, i64)
+declare <vscale x 4 x float> @llvm.aarch64.sve.ld1q.gather.scalar.offset.nxv4f32.nxv2i64(<vscale x 4 x i1>, <vscale x 2 x i64>, i64)
+declare <vscale x 8 x half> @llvm.aarch64.sve.ld1q.gather.scalar.offset.nxv8f16.nxv2i64(<vscale x 8 x i1>, <vscale x 2 x i64>, i64)
+declare <vscale x 8 x bfloat> @llvm.aarch64.sve.ld1q.gather.scalar.offset.nxv8bf16.nxv2i64(<vscale x 8 x i1>, <vscale x 2 x i64>, i64)
+declare <vscale x 2 x i64> @llvm.aarch64.sve.ld1q.gather.index.nxv2i64(<vscale x 2 x i1>, ptr, <vscale x 2 x i64>)
+declare <vscale x 2 x double> @llvm.aarch64.sve.ld1q.gather.index.nxv2f64(<vscale x 2 x i1>, ptr, <vscale x 2 x i64>)
+
+define <vscale x 2 x i64> @test_svld1q_gather_u64index_s64(<vscale x 2 x i1> %pg, ptr noundef %base, <vscale x 2 x i64> %idx) {
+; CHECK-LABEL: test_svld1q_gather_u64index_s64:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: lsl z0.d, z0.d, #3
+; CHECK-NEXT: ld1q { z0.q }, p0/z, [z0.d, x0]
+; CHECK-NEXT: ret
+entry:
+ %0 = tail call <vscale x 2 x i64> @llvm.aarch64.sve.ld1q.gather.index.nxv2i64(<vscale x 2 x i1> %pg, ptr %base, <vscale x 2 x i64> %idx)
+ ret <vscale x 2 x i64> %0
+}
+
+define <vscale x 2 x i64> @test_svld1q_gather_u64index_u64(<vscale x 2 x i1> %pg, ptr noundef %base, <vscale x 2 x i64> %idx) {
+; CHECK-LABEL: test_svld1q_gather_u64index_u64:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: lsl z0.d, z0.d, #3
+; CHECK-NEXT: ld1q { z0.q }, p0/z, [z0.d, x0]
+; CHECK-NEXT: ret
+entry:
+ %0 = tail call <vscale x 2 x i64> @llvm.aarch64.sve.ld1q.gather.index.nxv2i64(<vscale x 2 x i1> %pg, ptr %base, <vscale x 2 x i64> %idx)
+ ret <vscale x 2 x i64> %0
+}
+
+define <vscale x 2 x double> @test_svld1q_gather_u64index_f64(<vscale x 2 x i1> %pg, ptr noundef %base, <vscale x 2 x i64> %idx) {
+; CHECK-LABEL: test_svld1q_gather_u64index_f64:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: lsl z0.d, z0.d, #3
+; CHECK-NEXT: ld1q { z0.q }, p0/z, [z0.d, x0]
+; CHECK-NEXT: ret
+entry:
+ %0 = tail call <vscale x 2 x double> @llvm.aarch64.sve.ld1q.gather.index.nxv2f64(<vscale x 2 x i1> %pg, ptr %base, <vscale x 2 x i64> %idx)
+ ret <vscale x 2 x double> %0
+}
+
+define <vscale x 16 x i8> @test_svld1q_gather_u64base_index_s8(<vscale x 16 x i1> %pg, <vscale x 2 x i64> %base, i64 noundef %idx) {
+; CHECK-LABEL: test_svld1q_gather_u64base_index_s8:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: ld1q { z0.q }, p0/z, [z0.d, x0]
+; CHECK-NEXT: ret
+entry:
+ %0 = tail call <vscale x 16 x i8> @llvm.aarch64.sve.ld1q.gather.scalar.offset.nxv16i8.nxv2i64(<vscale x 16 x i1> %pg, <vscale x 2 x i64> %base, i64 %idx)
+ ret <vscale x 16 x i8> %0
+}
+
+define <vscale x 16 x i8> @test_svld1q_gather_u64base_index_u8(<vscale x 16 x i1> %pg, <vscale x 2 x i64> %base, i64 noundef %idx) {
+; CHECK-LABEL: test_svld1q_gather_u64base_index_u8:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: ld1q { z0.q }, p0/z, [z0.d, x0]
+; CHECK-NEXT: ret
+entry:
+ %0 = tail call <vscale x 16 x i8> @llvm.aarch64.sve.ld1q.gather.scalar.offset.nxv16i8.nxv2i64(<vscale x 16 x i1> %pg, <vscale x 2 x i64> %base, i64 %idx)
+ ret <vscale x 16 x i8> %0
+}
+
+define <vscale x 8 x i16> @test_svld1q_gather_u64base_index_s16(<vscale x 8 x i1> %pg, <vscale x 2 x i64> %base, i64 noundef %idx) {
+; CHECK-LABEL: test_svld1q_gather_u64base_index_s16:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: lsl x8, x0, #1
+; CHECK-NEXT: ld1q { z0.q }, p0/z, [z0.d, x8]
+; CHECK-NEXT: ret
+entry:
+ %0 = shl i64 %idx, 1
+ %1 = tail call <vscale x 8 x i16> @llvm.aarch64.sve.ld1q.gather.scalar.offset.nxv8i16.nxv2i64(<vscale x 8 x i1> %pg, <vscale x 2 x i64> %base, i64 %0)
+ ret <vscale x 8 x i16> %1
+}
+
+define <vscale x 8 x i16> @test_svld1q_gather_u64base_index_u16(<vscale x 8 x i1> %pg, <vscale x 2 x i64> %base, i64 noundef %idx) {
+; CHECK-LABEL: test_svld1q_gather_u64base_index_u16:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: lsl x8, x0, #1
+; CHECK-NEXT: ld1q { z0.q }, p0/z, [z0.d, x8]
+; CHECK-NEXT: ret
+entry:
+ %0 = shl i64 %idx, 1
+ %1 = tail call <vscale x 8 x i16> @llvm.aarch64.sve.ld1q.gather.scalar.offset.nxv8i16.nxv2i64(<vscale x 8 x i1> %pg, <vscale x 2 x i64> %base, i64 %0)
+ ret <vscale x 8 x i16> %1
+}
+
+define <vscale x 4 x i32> @test_svld1q_gather_u64base_index_s32(<vscale x 4 x i1> %pg, <vscale x 2 x i64> %base, i64 noundef %idx) {
+; CHECK-LABEL: test_svld1q_gather_u64base_index_s32:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: lsl x8, x0, #2
+; CHECK-NEXT: ld1q { z0.q }, p0/z, [z0.d, x8]
+; CHECK-NEXT: ret
+entry:
+ %0 = shl i64 %idx, 2
+ %1 = tail call <vscale x 4 x i32> @llvm.aarch64.sve.ld1q.gather.scalar.offset.nxv4i32.nxv2i64(<vscale x 4 x i1> %pg, <vscale x 2 x i64> %base, i64 %0)
+ ret <vscale x 4 x i32> %1
+}
+
+define <vscale x 4 x i32> @test_svld1q_gather_u64base_index_u32(<vscale x 4 x i1> %pg, <vscale x 2 x i64> %base, i64 noundef %idx) {
+; CHECK-LABEL: test_svld1q_gather_u64base_index_u32:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: lsl x8, x0, #2
+; CHECK-NEXT: ld1q { z0.q }, p0/z, [z0.d, x8]
+; CHECK-NEXT: ret
+entry:
+ %0 = shl i64 %idx, 2
+ %1 = tail call <vscale x 4 x i32> @llvm.aarch64.sve.ld1q.gather.scalar.offset.nxv4i32.nxv2i64(<vscale x 4 x i1> %pg, <vscale x 2 x i64> %base, i64 %0)
+ ret <vscale x 4 x i32> %1
+}
+
+define <vscale x 2 x i64> @test_svld1q_gather_u64base_index_s64(<vscale x 2 x i1> %pg, <vscale x 2 x i64> %base, i64 noundef %idx) {
+; CHECK-LABEL: test_svld1q_gather_u64base_index_s64:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: lsl x8, x0, #3
+; CHECK-NEXT: ld1q { z0.q }, p0/z, [z0.d, x8]
+; CHECK-NEXT: ret
+entry:
+ %0 = shl i64 %idx, 3
+ %1 = tail call <vscale x 2 x i64> @llvm.aarch64.sve.ld1q.gather.scalar.offset.nxv2i64.nxv2i64(<vscale x 2 x i1> %pg, <vscale x 2 x i64> %base, i64 %0)
+ ret <vscale x 2 x i64> %1
+}
+
+define <vscale x 2 x i64> @test_svld1q_gather_u64base_index_u64(<vscale x 2 x i1> %pg, <vscale x 2 x i64> %base, i64 noundef %idx) {
+; CHECK-LABEL: test_svld1q_gather_u64base_index_u64:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: lsl x8, x0, #3
+; CHECK-NEXT: ld1q { z0.q }, p0/z, [z0.d, x8]
+; CHECK-NEXT: ret
+entry:
+ %0 = shl i64 %idx, 3
+ %1 = tail call <vscale x 2 x i64> @llvm.aarch64.sve.ld1q.gather.scalar.offset.nxv2i64.nxv2i64(<vscale x 2 x i1> %pg, <vscale x 2 x i64> %base, i64 %0)
+ ret <vscale x 2 x i64> %1
+}
+
+define <vscale x 8 x bfloat> @test_svld1q_gather_u64base_index_bf16(<vscale x 8 x i1> %pg, <vscale x 2 x i64> %base, i64 noundef %idx) {
+; CHECK-LABEL: test_svld1q_gather_u64base_index_bf16:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: lsl x8, x0, #1
+; CHECK-NEXT: ld1q { z0.q }, p0/z, [z0.d, x8]
+; CHECK-NEXT: ret
+entry:
+ %0 = shl i64 %idx, 1
+ %1 = tail call <vscale x 8 x bfloat> @llvm.aarch64.sve.ld1q.gather.scalar.offset.nxv8bf16.nxv2i64(<vscale x 8 x i1> %pg, <vscale x 2 x i64> %base, i64 %0)
+ ret <vscale x 8 x bfloat> %1
+}
+
+define <vscale x 8 x half> @test_svld1q_gather_u64base_index_f16(<vscale x 8 x i1> %pg, <vscale x 2 x i64> %base, i64 noundef %idx) {
+; CHECK-LABEL: test_svld1q_gather_u64base_index_f16:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: lsl x8, x0, #1
+; CHECK-NEXT: ld1q { z0.q }, p0/z, [z0.d, x8]
+; CHECK-NEXT: ret
+entry:
+ %0 = shl i64 %idx, 1
+ %1 = tail call <vscale x 8 x half> @llvm.aarch64.sve.ld1q.gather.scalar.offset.nxv8f16.nxv2i64(<vscale x 8 x i1> %pg, <vscale x 2 x i64> %base, i64 %0)
+ ret <vscale x 8 x half> %1
+}
+
+define <vscale x 4 x float> @test_svld1q_gather_u64base_index_f32(<vscale x 4 x i1> %pg, <vscale x 2 x i64> %base, i64 noundef %idx) {
+; CHECK-LABEL: test_svld1q_gather_u64base_index_f32:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: lsl x8, x0, #2
+; CHECK-NEXT: ld1q { z0.q }, p0/z, [z0.d, x8]
+; CHECK-NEXT: ret
+entry:
+ %0 = shl i64 %idx, 2
+ %1 = tail call <vscale x 4 x float> @llvm.aarch64.sve.ld1q.gather.scalar.offset.nxv4f32.nxv2i64(<vscale x 4 x i1> %pg, <vscale x 2 x i64> %base, i64 %0)
+ ret <vscale x 4 x float> %1
+}
+
+define <vscale x 2 x double> @test_svld1q_gather_u64base_index_f64(<vscale x 2 x i1> %pg, <vscale x 2 x i64> %base, i64 noundef %idx) {
+; CHECK-LABEL: test_svld1q_gather_u64base_index_f64:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: lsl x8, x0, #3
+; CHECK-NEXT: ld1q { z0.q }, p0/z, [z0.d, x8]
+; CHECK-NEXT: ret
+entry:
+ %0 = shl i64 %idx, 3
+ %1 = tail call <vscale x 2 x double> @llvm.aarch64.sve.ld1q.gather.scalar.offset.nxv2f64.nxv2i64(<vscale x 2 x i1> %pg, <vscale x 2 x i64> %base, i64 %0)
+ ret <vscale x 2 x double> %1
+}
+
diff --git a/llvm/test/CodeGen/AArch64/sve2p1-intrinsics-scatter-stores-128bit-index.ll b/llvm/test/CodeGen/AArch64/sve2p1-intrinsics-scatter-stores-128bit-index.ll
new file mode 100644
index 000000000000000..3792962a4987b5e
--- /dev/null
+++ b/llvm/test/CodeGen/AArch64/sve2p1-intrinsics-scatter-stores-128bit-index.ll
@@ -0,0 +1,186 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 3
+; RUN: llc -mtriple=aarch64-linux-gnu -mattr=+sve2p1,+bf16 < %s | FileCheck %s
+
+declare void @llvm.aarch64.sve.st1q.scatter.scalar.offset.nxv2i64.nxv2i64(<vscale x 2 x i64>, <vscale x 2 x i1>, <vscale x 2 x i64>, i64)
+declare void @llvm.aarch64.sve.st1q.scatter.scalar.offset.nxv4i32.nxv2i64(<vscale x 4 x i32>, <vscale x 4 x i1>, <vscale x 2 x i64>, i64)
+declare void @llvm.aarch64.sve.st1q.scatter.scalar.offset.nxv8i16.nxv2i64(<vscale x 8 x i16>, <vscale x 8 x i1>, <vscale x 2 x i64>, i64)
+declare void @llvm.aarch64.sve.st1q.scatter.scalar.offset.nxv16i8.nxv2i64(<vscale x 16 x i8>, <vscale x 16 x i1>, <vscale x 2 x i64>, i64)
+declare void @llvm.aarch64.sve.st1q.scatter.scalar.offset.nxv2f64.nxv2i64(<vscale x 2 x double>, <vscale x 2 x i1>, <vscale x 2 x i64>, i64)
+declare void @llvm.aarch64.sve.st1q.scatter.scalar.offset.nxv4f32.nxv2i64(<vscale x 4 x float>, <vscale x 4 x i1>, <vscale x 2 x i64>, i64)
+declare void @llvm.aarch64.sve.st1q.scatter.scalar.offset.nxv8f16.nxv2i64(<vscale x 8 x half>, <vscale x 8 x i1>, <vscale x 2 x i64>, i64)
+declare void @llvm.aarch64.sve.st1q.scatter.scalar.offset.nxv8bf16.nxv2i64(<vscale x 8 x bfloat>, <vscale x 8 x i1>, <vscale x 2 x i64>, i64)
+declare void @llvm.aarch64.sve.st1q.scatter.index.nxv2i64(<vscale x 2 x i64>, <vscale x 2 x i1>, ptr, <vscale x 2 x i64>)
+declare void @llvm.aarch64.sve.st1q.scatter.index.nxv2f64(<vscale x 2 x double>, <vscale x 2 x i1>, ptr, <vscale x 2 x i64>)
+
+define void @test_svst1q_scatter_u64index_s64(<vscale x 2 x i1> %pg, ptr noundef %base, <vscale x 2 x i64> %idx, <vscale x 2 x i64> %data) {
+; CHECK-LABEL: test_svst1q_scatter_u64index_s64:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: lsl z0.d, z0.d, #3
+; CHECK-NEXT: st1q { z1.q }, p0, [z0.d, x0]
+; CHECK-NEXT: ret
+entry:
+ tail call void @llvm.aarch64.sve.st1q.scatter.index.nxv2i64(<vscale x 2 x i64> %data, <vscale x 2 x i1> %pg, ptr %base, <vscale x 2 x i64> %idx)
+ ret void
+}
+
+define void @test_svst1q_scatter_u64index_u64(<vscale x 2 x i1> %pg, ptr noundef %base, <vscale x 2 x i64> %idx, <vscale x 2 x i64> %data) {
+; CHECK-LABEL: test_svst1q_scatter_u64index_u64:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: lsl z0.d, z0.d, #3
+; CHECK-NEXT: st1q { z1.q }, p0, [z0.d, x0]
+; CHECK-NEXT: ret
+entry:
+ tail call void @llvm.aarch64.sve.st1q.scatter.index.nxv2i64(<vscale x 2 x i64> %data, <vscale x 2 x i1> %pg, ptr %base, <vscale x 2 x i64> %idx)
+ ret void
+}
+
+define void @test_svst1q_scatter_u64index_f64(<vscale x 2 x i1> %pg, ptr noundef %base, <vscale x 2 x i64> %idx, <vscale x 2 x double> %data) {
+; CHECK-LABEL: test_svst1q_scatter_u64index_f64:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: lsl z0.d, z0.d, #3
+; CHECK-NEXT: st1q { z1.q }, p0, [z0.d, x0]
+; CHECK-NEXT: ret
+entry:
+ tail call void @llvm.aarch64.sve.st1q.scatter.index.nxv2f64(<vscale x 2 x double> %data, <vscale x 2 x i1> %pg, ptr %base, <vscale x 2 x i64> %idx)
+ ret void
+}
+
+define void @test_svst1q_scatter_u64base_index_s8(<vscale x 16 x i1> %pg, <vscale x 2 x i64> %base, i64 noundef %idx, <vscale x 16 x i8> %data) {
+; CHECK-LABEL: test_svst1q_scatter_u64base_index_s8:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: st1q { z1.q }, p0, [z0.d, x0]
+; CHECK-NEXT: ret
+entry:
+ tail call void @llvm.aarch64.sve.st1q.scatter.scalar.offset.nxv16i8.nxv2i64(<vscale x 16 x i8> %data, <vscale x 16 x i1> %pg, <vscale x 2 x i64> %base, i64 %idx)
+ ret void
+}
+
+define void @test_svst1q_scatter_u64base_index_u8(<vscale x 16 x i1> %pg, <vscale x 2 x i64> %base, i64 noundef %idx, <vscale x 16 x i8> %data) {
+; CHECK-LABEL: test_svst1q_scatter_u64base_index_u8:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: st1q { z1.q }, p0, [z0.d, x0]
+; CHECK-NEXT: ret
+entry:
+ tail call void @llvm.aarch64.sve.st1q.scatter.scalar.offset.nxv16i8.nxv2i64(<vscale x 16 x i8> %data, <vscale x 16 x i1> %pg, <vscale x 2 x i64> %base, i64 %idx)
+ ret void
+}
+
+define void @test_svst1q_scatter_u64base_index_s16(<vscale x 8 x i1> %pg, <vscale x 2 x i64> %base, i64 noundef %idx, <vscale x 8 x i16> %data) {
+; CHECK-LABEL: test_svst1q_scatter_u64base_index_s16:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: lsl x8, x0, #1
+; CHECK-NEXT: st1q { z1.q }, p0, [z0.d, x8]
+; CHECK-NEXT: ret
+entry:
+ %0 = shl i64 %idx, 1
+ tail call void @llvm.aarch64.sve.st1q.scatter.scalar.offset.nxv8i16.nxv2i64(<vscale x 8 x i16> %data, <vscale x 8 x i1> %pg, <vscale x 2 x i64> %base, i64 %0)
+ ret void
+}
+
+define void @test_svst1q_scatter_u64base_index_u16(<vscale x 8 x i1> %pg, <vscale x 2 x i64> %base, i64 noundef %idx, <vscale x 8 x i16> %data) {
+; CHECK-LABEL: test_svst1q_scatter_u64base_index_u16:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: lsl x8, x0, #1
+; CHECK-NEXT: st1q { z1.q }, p0, [z0.d, x8]
+; CHECK-NEXT: ret
+entry:
+ %0 = shl i64 %idx, 1
+ tail call void @llvm.aarch64.sve.st1q.scatter.scalar.offset.nxv8i16.nxv2i64(<vscale x 8 x i16> %data, <vscale x 8 x i1> %pg, <vscale x 2 x i64> %base, i64 %0)
+ ret void
+}
+
+define void @test_svst1q_scatter_u64base_index_s32(<vscale x 4 x i1> %pg, <vscale x 2 x i64> %base, i64 noundef %idx, <vscale x 4 x i32> %data) {
+; CHECK-LABEL: test_svst1q_scatter_u64base_index_s32:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: lsl x8, x0, #2
+; CHECK-NEXT: st1q { z1.q }, p0, [z0.d, x8]
+; CHECK-NEXT: ret
+entry:
+ %0 = shl i64 %idx, 2
+ tail call void @llvm.aarch64.sve.st1q.scatter.scalar.offset.nxv4i32.nxv2i64(<vscale x 4 x i32> %data, <vscale x 4 x i1> %pg, <vscale x 2 x i64> %base, i64 %0)
+ ret void
+}
+
+define void @test_svst1q_scatter_u64base_index_u32(<vscale x 4 x i1> %pg, <vscale x 2 x i64> %base, i64 noundef %idx, <vscale x 4 x i32> %data) {
+; CHECK-LABEL: test_svst1q_scatter_u64base_index_u32:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: lsl x8, x0, #2
+; CHECK-NEXT: st1q { z1.q }, p0, [z0.d, x8]
+; CHECK-NEXT: ret
+entry:
+ %0 = shl i64 %idx, 2
+ tail call void @llvm.aarch64.sve.st1q.scatter.scalar.offset.nxv4i32.nxv2i64(<vscale x 4 x i32> %data, <vscale x 4 x i1> %pg, <vscale x 2 x i64> %base, i64 %0)
+ ret void
+}
+
+define void @test_svst1q_scatter_u64base_index_s64(<vscale x 2 x i1> %pg, <vscale x 2 x i64> %base, i64 noundef %idx, <vscale x 2 x i64> %data) {
+; CHECK-LABEL: test_svst1q_scatter_u64base_index_s64:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: lsl x8, x0, #3
+; CHECK-NEXT: st1q { z1.q }, p0, [z0.d, x8]
+; CHECK-NEXT: ret
+entry:
+ %0 = shl i64 %idx, 3
+ tail call void @llvm.aarch64.sve.st1q.scatter.scalar.offset.nxv2i64.nxv2i64(<vscale x 2 x i64> %data, <vscale x 2 x i1> %pg, <vscale x 2 x i64> %base, i64 %0)
+ ret void
+}
+
+define void @test_svst1q_scatter_u64base_index_u64(<vscale x 2 x i1> %pg, <vscale x 2 x i64> %base, i64 noundef %idx, <vscale x 2 x i64> %data) {
+; CHECK-LABEL: test_svst1q_scatter_u64base_index_u64:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: lsl x8, x0, #3
+; CHECK-NEXT: st1q { z1.q }, p0, [z0.d, x8]
+; CHECK-NEXT: ret
+entry:
+ %0 = shl i64 %idx, 3
+ tail call void @llvm.aarch64.sve.st1q.scatter.scalar.offset.nxv2i64.nxv2i64(<vscale x 2 x i64> %data, <vscale x 2 x i1> %pg, <vscale x 2 x i64> %base, i64 %0)
+ ret void
+}
+
+define void @test_svst1q_scatter_u64base_index_bf16(<vscale x 8 x i1> %pg, <vscale x 2 x i64> %base, i64 noundef %idx, <vscale x 8 x bfloat> %data) {
+; CHECK-LABEL: test_svst1q_scatter_u64base_index_bf16:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: lsl x8, x0, #1
+; CHECK-NEXT: st1q { z1.q }, p0, [z0.d, x8]
+; CHECK-NEXT: ret
+entry:
+ %0 = shl i64 %idx, 1
+ tail call void @llvm.aarch64.sve.st1q.scatter.scalar.offset.nxv8bf16.nxv2i64(<vscale x 8 x bfloat> %data, <vscale x 8 x i1> %pg, <vscale x 2 x i64> %base, i64 %0)
+ ret void
+}
+
+define void @test_svst1q_scatter_u64base_index_f16(<vscale x 8 x i1> %pg, <vscale x 2 x i64> %base, i64 noundef %idx, <vscale x 8 x half> %data) {
+; CHECK-LABEL: test_svst1q_scatter_u64base_index_f16:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: lsl x8, x0, #1
+; CHECK-NEXT: st1q { z1.q }, p0, [z0.d, x8]
+; CHECK-NEXT: ret
+entry:
+ %0 = shl i64 %idx, 1
+ tail call void @llvm.aarch64.sve.st1q.scatter.scalar.offset.nxv8f16.nxv2i64(<vscale x 8 x half> %data, <vscale x 8 x i1> %pg, <vscale x 2 x i64> %base, i64 %0)
+ ret void
+}
+
+define void @test_svst1q_scatter_u64base_index_f32(<vscale x 4 x i1> %pg, <vscale x 2 x i64> %base, i64 noundef %idx, <vscale x 4 x float> %data) {
+; CHECK-LABEL: test_svst1q_scatter_u64base_index_f32:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: lsl x8, x0, #2
+; CHECK-NEXT: st1q { z1.q }, p0, [z0.d, x8]
+; CHECK-NEXT: ret
+entry:
+ %0 = shl i64 %idx, 2
+ tail call void @llvm.aarch64.sve.st1q.scatter.scalar.offset.nxv4f32.nxv2i64(<vscale x 4 x float> %data, <vscale x 4 x i1> %pg, <vscale x 2 x i64> %base, i64 %0)
+ ret void
+}
+
+define void @test_svst1q_scatter_u64base_index_f64(<vscale x 2 x i1> %pg, <vscale x 2 x i64> %base, i64 noundef %idx, <vscale x 2 x double> %data) {
+; CHECK-LABEL: test_svst1q_scatter_u64base_index_f64:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: lsl x8, x0, #3
+; CHECK-NEXT: st1q { z1.q }, p0, [z0.d, x8]
+; CHECK-NEXT: ret
+entry:
+ %0 = shl i64 %idx, 3
+ tail call void @llvm.aarch64.sve.st1q.scatter.scalar.offset.nxv2f64.nxv2i64(<vscale x 2 x double> %data, <vscale x 2 x i1> %pg, <vscale x 2 x i64> %base, i64 %0)
+ ret void
+}
More information about the cfe-commits
mailing list