[clang] [llvm] [AArch64] Add quadword gather load/scatter store intrinsics with unscaled vector offset (PR #71290)
Momchil Velikov via cfe-commits
cfe-commits at lists.llvm.org
Tue Nov 21 08:17:57 PST 2023
https://github.com/momchil-velikov updated https://github.com/llvm/llvm-project/pull/71290
From 9f90ac3383b37e9d2310836527d01a94b6fbadb9 Mon Sep 17 00:00:00 2001
From: Momchil Velikov <momchil.velikov at arm.com>
Date: Fri, 27 Oct 2023 16:09:07 +0100
Subject: [PATCH 1/4] [AArch64] Add SVE2.1 intrinsics for indexed quadword
gather loads and scatter stores
This patch adds the quadword gather load intrinsics of the form
(1) sv<type>_t svld1q_gather_u64index_<typ>(svbool_t, const <type>_t *, svuint64_t);
(2) sv<type>_t svld1q_gather_u64base_index_<typ>(svbool_t, svuint64_t, int64_t);
and the quadword scatter store intrinsics of the form
(3) void svst1q_scatter_u64index_<typ>(svbool_t, <type>_t *, svuint64_t, sv<type>_t);
(4) void svst1q_scatter_u64base_index_<typ>(svbool_t, svuint64_t, int64_t, sv<type>_t);
(intrinsics (1) and (3) are currently missing the variants for 8-bit element
types, e.g. `int8_t` and `uint8_t`).
ACLE spec: https://github.com/ARM-software/acle/pull/257
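For illustration, minimal uses of two of the new intrinsics (a sketch only:
the function names below are made up for the example, and compiling requires
SVE2.1 support, e.g. -march=armv9-a+sve2p1):

  #include <arm_sve.h>

  // Quadword gather, form (1): scalar base plus a vector of 64-bit indices,
  // each index scaled by sizeof(int32_t).
  svint32_t gather_i32(svbool_t pg, const int32_t *base, svuint64_t idx) {
    return svld1q_gather_u64index_s32(pg, base, idx);
  }

  // Quadword scatter, form (4): vector of 64-bit base addresses plus a
  // scalar index, scaled by sizeof(float64_t).
  void scatter_f64(svbool_t pg, svuint64_t bases, int64_t idx, svfloat64_t data) {
    svst1q_scatter_u64base_index_f64(pg, bases, idx, data);
  }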
---
clang/include/clang/Basic/arm_sve.td | 12 +
.../acle_sve2p1_loads.c | 340 ++++++++++++++++++
.../acle_sve2p1_store.c | 340 ++++++++++++++++++
llvm/include/llvm/IR/IntrinsicsAArch64.td | 25 ++
.../Target/AArch64/AArch64ISelLowering.cpp | 38 +-
llvm/lib/Target/AArch64/AArch64ISelLowering.h | 2 +
...p1-intrinsics-gather-loads-128bit-index.ll | 249 +++++++++++++
...-intrinsics-scatter-stores-128bit-index.ll | 248 +++++++++++++
8 files changed, 1244 insertions(+), 10 deletions(-)
create mode 100644 llvm/test/CodeGen/AArch64/sve2p1-intrinsics-gather-loads-128bit-index.ll
create mode 100644 llvm/test/CodeGen/AArch64/sve2p1-intrinsics-scatter-stores-128bit-index.ll
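Note that the "vector base + scalar index" forms (2) and (4) do not get new
LLVM intrinsics: as the CHECK lines in the tests below show, Clang scales the
index into a byte offset and reuses the existing ld1q.gather.scalar.offset /
st1q.scatter.scalar.offset intrinsics. A sketch of the equivalence at the C
level (the _offset variant here is assumed from the same ACLE proposal; it is
not part of this patch):

  #include <arm_sve.h>

  svfloat32_t via_index(svbool_t pg, svuint64_t bases, int64_t idx) {
    // Lowered as: shl i64 %idx, 2, followed by a call to
    // llvm.aarch64.sve.ld1q.gather.scalar.offset.nxv4f32.nxv2i64.
    return svld1q_gather_u64base_index_f32(pg, bases, idx);
  }

  svfloat32_t via_offset(svbool_t pg, svuint64_t bases, int64_t idx) {
    // Equivalent byte-offset form: the index scaled by sizeof(float32_t).
    return svld1q_gather_u64base_offset_f32(pg, bases, idx * 4);
  }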
diff --git a/clang/include/clang/Basic/arm_sve.td b/clang/include/clang/Basic/arm_sve.td
index 07d9fc6e04f1b29..ee46a81a942c3f3 100644
--- a/clang/include/clang/Basic/arm_sve.td
+++ b/clang/include/clang/Basic/arm_sve.td
@@ -319,6 +319,12 @@ let TargetGuard = "sve2p1" in {
defm SVLD2Q_VNUM : StructLoad<"svld2q_vnum[_{2}]", "2Pcl", "aarch64_sve_ld2q_sret">;
defm SVLD3Q_VNUM : StructLoad<"svld3q_vnum[_{2}]", "3Pcl", "aarch64_sve_ld3q_sret">;
defm SVLD4Q_VNUM : StructLoad<"svld4q_vnum[_{2}]", "4Pcl", "aarch64_sve_ld4q_sret">;
+
+ // Load quadwords (scalar base + vector index)
+ def SVLD1Q_GATHER_INDICES_U : MInst<"svld1q_gather_[{3}]index[_{0}]", "dPcg", "sUsiUilUlbhfd", [IsGatherLoad], MemEltTyDefault, "aarch64_sve_ld1q_gather_index">;
+
+ // Load quadwords (vector base + scalar index)
+ def SVLD1Q_GATHER_INDEX_S : MInst<"svld1q_gather[_{2}base]_index_{0}", "dPgl", "sUsiUilUlbhfd", [IsGatherLoad], MemEltTyDefault, "aarch64_sve_ld1q_gather_scalar_offset">;
}
////////////////////////////////////////////////////////////////////////////////
@@ -464,6 +470,12 @@ let TargetGuard = "sve2p1" in {
defm SVST2Q_VNUM : StructStore<"svst2q_vnum[_{d}]", "vPcl2", "aarch64_sve_st2q">;
defm SVST3Q_VNUM : StructStore<"svst3q_vnum[_{d}]", "vPcl3", "aarch64_sve_st3q">;
defm SVST4Q_VNUM : StructStore<"svst4q_vnum[_{d}]", "vPcl4", "aarch64_sve_st4q">;
+
+ // Scatter store quadwords (scalar base + vector index)
+ def SVST1Q_SCATTER_INDICES_U : MInst<"svst1q_scatter_[{3}]index[_{0}]", "vPpgd", "sUsiUilUlbhfd", [IsScatterStore], MemEltTyDefault, "aarch64_sve_st1q_scatter_index">;
+
+ // Scatter store quadwords (vector base + scalar index)
+ def SVST1Q_SCATTER_INDEX_S : MInst<"svst1q_scatter[_{2}base]_index[_{0}]", "vPgld", "sUsiUilUlbhfd", [IsScatterStore], MemEltTyDefault, "aarch64_sve_st1q_scatter_scalar_offset">;
}
////////////////////////////////////////////////////////////////////////////////
diff --git a/clang/test/CodeGen/aarch64-sve2p1-intrinsics/acle_sve2p1_loads.c b/clang/test/CodeGen/aarch64-sve2p1-intrinsics/acle_sve2p1_loads.c
index 35e0069e17c136f..44351347e4cf0ae 100644
--- a/clang/test/CodeGen/aarch64-sve2p1-intrinsics/acle_sve2p1_loads.c
+++ b/clang/test/CodeGen/aarch64-sve2p1-intrinsics/acle_sve2p1_loads.c
@@ -2528,3 +2528,343 @@ svbfloat16_t test_svld1q_gather_u64base_bf16(svbool_t pg, svuint64_t base)
{
return SVE_ACLE_FUNC(svld1q_gather,_u64base,_bf16,)(pg, base);
}
+
+// CHECK-LABEL: @test_svld1q_gather_u64index_s16(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 1 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv1i1(<vscale x 16 x i1> [[PG:%.*]])
+// CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 8 x i16> @llvm.aarch64.sve.ld1q.gather.index.nxv8i16(<vscale x 1 x i1> [[TMP0]], ptr [[BASE:%.*]], <vscale x 2 x i64> [[IDX:%.*]])
+// CHECK-NEXT: ret <vscale x 8 x i16> [[TMP1]]
+//
+// CPP-CHECK-LABEL: @_Z31test_svld1q_gather_u64index_s16u10__SVBool_tPKsu12__SVUint64_t(
+// CPP-CHECK-NEXT: entry:
+// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 1 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv1i1(<vscale x 16 x i1> [[PG:%.*]])
+// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 8 x i16> @llvm.aarch64.sve.ld1q.gather.index.nxv8i16(<vscale x 1 x i1> [[TMP0]], ptr [[BASE:%.*]], <vscale x 2 x i64> [[IDX:%.*]])
+// CPP-CHECK-NEXT: ret <vscale x 8 x i16> [[TMP1]]
+//
+svint16_t test_svld1q_gather_u64index_s16(svbool_t pg, const int16_t *base, svuint64_t idx) {
+ return SVE_ACLE_FUNC(svld1q_gather_,u64,index,_s16) (pg, base, idx);
+}
+
+// CHECK-LABEL: @test_svld1q_gather_u64index_u16(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 1 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv1i1(<vscale x 16 x i1> [[PG:%.*]])
+// CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 8 x i16> @llvm.aarch64.sve.ld1q.gather.index.nxv8i16(<vscale x 1 x i1> [[TMP0]], ptr [[BASE:%.*]], <vscale x 2 x i64> [[IDX:%.*]])
+// CHECK-NEXT: ret <vscale x 8 x i16> [[TMP1]]
+//
+// CPP-CHECK-LABEL: @_Z31test_svld1q_gather_u64index_u16u10__SVBool_tPKtu12__SVUint64_t(
+// CPP-CHECK-NEXT: entry:
+// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 1 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv1i1(<vscale x 16 x i1> [[PG:%.*]])
+// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 8 x i16> @llvm.aarch64.sve.ld1q.gather.index.nxv8i16(<vscale x 1 x i1> [[TMP0]], ptr [[BASE:%.*]], <vscale x 2 x i64> [[IDX:%.*]])
+// CPP-CHECK-NEXT: ret <vscale x 8 x i16> [[TMP1]]
+//
+svuint16_t test_svld1q_gather_u64index_u16(svbool_t pg, const uint16_t *base, svuint64_t idx) {
+ return SVE_ACLE_FUNC(svld1q_gather_,u64,index,_u16) (pg, base, idx);
+}
+
+// CHECK-LABEL: @test_svld1q_gather_u64index_s32(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 1 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv1i1(<vscale x 16 x i1> [[PG:%.*]])
+// CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 4 x i32> @llvm.aarch64.sve.ld1q.gather.index.nxv4i32(<vscale x 1 x i1> [[TMP0]], ptr [[BASE:%.*]], <vscale x 2 x i64> [[IDX:%.*]])
+// CHECK-NEXT: ret <vscale x 4 x i32> [[TMP1]]
+//
+// CPP-CHECK-LABEL: @_Z31test_svld1q_gather_u64index_s32u10__SVBool_tPKiu12__SVUint64_t(
+// CPP-CHECK-NEXT: entry:
+// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 1 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv1i1(<vscale x 16 x i1> [[PG:%.*]])
+// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 4 x i32> @llvm.aarch64.sve.ld1q.gather.index.nxv4i32(<vscale x 1 x i1> [[TMP0]], ptr [[BASE:%.*]], <vscale x 2 x i64> [[IDX:%.*]])
+// CPP-CHECK-NEXT: ret <vscale x 4 x i32> [[TMP1]]
+//
+svint32_t test_svld1q_gather_u64index_s32(svbool_t pg, const int32_t *base, svuint64_t idx) {
+ return SVE_ACLE_FUNC(svld1q_gather_,u64,index,_s32) (pg, base, idx);
+}
+
+// CHECK-LABEL: @test_svld1q_gather_u64index_u32(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 1 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv1i1(<vscale x 16 x i1> [[PG:%.*]])
+// CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 4 x i32> @llvm.aarch64.sve.ld1q.gather.index.nxv4i32(<vscale x 1 x i1> [[TMP0]], ptr [[BASE:%.*]], <vscale x 2 x i64> [[IDX:%.*]])
+// CHECK-NEXT: ret <vscale x 4 x i32> [[TMP1]]
+//
+// CPP-CHECK-LABEL: @_Z31test_svld1q_gather_u64index_u32u10__SVBool_tPKju12__SVUint64_t(
+// CPP-CHECK-NEXT: entry:
+// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 1 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv1i1(<vscale x 16 x i1> [[PG:%.*]])
+// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 4 x i32> @llvm.aarch64.sve.ld1q.gather.index.nxv4i32(<vscale x 1 x i1> [[TMP0]], ptr [[BASE:%.*]], <vscale x 2 x i64> [[IDX:%.*]])
+// CPP-CHECK-NEXT: ret <vscale x 4 x i32> [[TMP1]]
+//
+svuint32_t test_svld1q_gather_u64index_u32(svbool_t pg, const uint32_t *base, svuint64_t idx) {
+ return SVE_ACLE_FUNC(svld1q_gather_,u64,index,_u32) (pg, base, idx);
+}
+
+// CHECK-LABEL: @test_svld1q_gather_u64index_s64(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 1 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv1i1(<vscale x 16 x i1> [[PG:%.*]])
+// CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 2 x i64> @llvm.aarch64.sve.ld1q.gather.index.nxv2i64(<vscale x 1 x i1> [[TMP0]], ptr [[BASE:%.*]], <vscale x 2 x i64> [[IDX:%.*]])
+// CHECK-NEXT: ret <vscale x 2 x i64> [[TMP1]]
+//
+// CPP-CHECK-LABEL: @_Z31test_svld1q_gather_u64index_s64u10__SVBool_tPKlu12__SVUint64_t(
+// CPP-CHECK-NEXT: entry:
+// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 1 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv1i1(<vscale x 16 x i1> [[PG:%.*]])
+// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 2 x i64> @llvm.aarch64.sve.ld1q.gather.index.nxv2i64(<vscale x 1 x i1> [[TMP0]], ptr [[BASE:%.*]], <vscale x 2 x i64> [[IDX:%.*]])
+// CPP-CHECK-NEXT: ret <vscale x 2 x i64> [[TMP1]]
+//
+svint64_t test_svld1q_gather_u64index_s64(svbool_t pg, const int64_t *base, svuint64_t idx) {
+ return SVE_ACLE_FUNC(svld1q_gather_,u64,index,_s64) (pg, base, idx);
+}
+
+// CHECK-LABEL: @test_svld1q_gather_u64index_u64(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 1 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv1i1(<vscale x 16 x i1> [[PG:%.*]])
+// CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 2 x i64> @llvm.aarch64.sve.ld1q.gather.index.nxv2i64(<vscale x 1 x i1> [[TMP0]], ptr [[BASE:%.*]], <vscale x 2 x i64> [[IDX:%.*]])
+// CHECK-NEXT: ret <vscale x 2 x i64> [[TMP1]]
+//
+// CPP-CHECK-LABEL: @_Z31test_svld1q_gather_u64index_u64u10__SVBool_tPKmu12__SVUint64_t(
+// CPP-CHECK-NEXT: entry:
+// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 1 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv1i1(<vscale x 16 x i1> [[PG:%.*]])
+// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 2 x i64> @llvm.aarch64.sve.ld1q.gather.index.nxv2i64(<vscale x 1 x i1> [[TMP0]], ptr [[BASE:%.*]], <vscale x 2 x i64> [[IDX:%.*]])
+// CPP-CHECK-NEXT: ret <vscale x 2 x i64> [[TMP1]]
+//
+svuint64_t test_svld1q_gather_u64index_u64(svbool_t pg, const uint64_t *base, svuint64_t idx) {
+ return SVE_ACLE_FUNC(svld1q_gather_,u64,index,_u64) (pg, base, idx);
+}
+
+// CHECK-LABEL: @test_svld1q_gather_u64index_bf16(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 1 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv1i1(<vscale x 16 x i1> [[PG:%.*]])
+// CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 8 x bfloat> @llvm.aarch64.sve.ld1q.gather.index.nxv8bf16(<vscale x 1 x i1> [[TMP0]], ptr [[BASE:%.*]], <vscale x 2 x i64> [[IDX:%.*]])
+// CHECK-NEXT: ret <vscale x 8 x bfloat> [[TMP1]]
+//
+// CPP-CHECK-LABEL: @_Z32test_svld1q_gather_u64index_bf16u10__SVBool_tPKu6__bf16u12__SVUint64_t(
+// CPP-CHECK-NEXT: entry:
+// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 1 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv1i1(<vscale x 16 x i1> [[PG:%.*]])
+// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 8 x bfloat> @llvm.aarch64.sve.ld1q.gather.index.nxv8bf16(<vscale x 1 x i1> [[TMP0]], ptr [[BASE:%.*]], <vscale x 2 x i64> [[IDX:%.*]])
+// CPP-CHECK-NEXT: ret <vscale x 8 x bfloat> [[TMP1]]
+//
+svbfloat16_t test_svld1q_gather_u64index_bf16(svbool_t pg, const bfloat16_t *base, svuint64_t idx) {
+ return SVE_ACLE_FUNC(svld1q_gather_,u64,index,_bf16) (pg, base, idx);
+}
+
+// CHECK-LABEL: @test_svld1q_gather_u64index_f16(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 1 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv1i1(<vscale x 16 x i1> [[PG:%.*]])
+// CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 8 x half> @llvm.aarch64.sve.ld1q.gather.index.nxv8f16(<vscale x 1 x i1> [[TMP0]], ptr [[BASE:%.*]], <vscale x 2 x i64> [[IDX:%.*]])
+// CHECK-NEXT: ret <vscale x 8 x half> [[TMP1]]
+//
+// CPP-CHECK-LABEL: @_Z31test_svld1q_gather_u64index_f16u10__SVBool_tPKDhu12__SVUint64_t(
+// CPP-CHECK-NEXT: entry:
+// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 1 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv1i1(<vscale x 16 x i1> [[PG:%.*]])
+// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 8 x half> @llvm.aarch64.sve.ld1q.gather.index.nxv8f16(<vscale x 1 x i1> [[TMP0]], ptr [[BASE:%.*]], <vscale x 2 x i64> [[IDX:%.*]])
+// CPP-CHECK-NEXT: ret <vscale x 8 x half> [[TMP1]]
+//
+svfloat16_t test_svld1q_gather_u64index_f16(svbool_t pg, const float16_t *base, svuint64_t idx) {
+ return SVE_ACLE_FUNC(svld1q_gather_,u64,index,_f16) (pg, base, idx);
+}
+
+// CHECK-LABEL: @test_svld1q_gather_u64index_f32(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 1 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv1i1(<vscale x 16 x i1> [[PG:%.*]])
+// CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 4 x float> @llvm.aarch64.sve.ld1q.gather.index.nxv4f32(<vscale x 1 x i1> [[TMP0]], ptr [[BASE:%.*]], <vscale x 2 x i64> [[IDX:%.*]])
+// CHECK-NEXT: ret <vscale x 4 x float> [[TMP1]]
+//
+// CPP-CHECK-LABEL: @_Z31test_svld1q_gather_u64index_f32u10__SVBool_tPKfu12__SVUint64_t(
+// CPP-CHECK-NEXT: entry:
+// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 1 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv1i1(<vscale x 16 x i1> [[PG:%.*]])
+// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 4 x float> @llvm.aarch64.sve.ld1q.gather.index.nxv4f32(<vscale x 1 x i1> [[TMP0]], ptr [[BASE:%.*]], <vscale x 2 x i64> [[IDX:%.*]])
+// CPP-CHECK-NEXT: ret <vscale x 4 x float> [[TMP1]]
+//
+svfloat32_t test_svld1q_gather_u64index_f32(svbool_t pg, const float32_t *base, svuint64_t idx) {
+ return SVE_ACLE_FUNC(svld1q_gather_,u64,index,_f32) (pg, base, idx);
+}
+
+// CHECK-LABEL: @test_svld1q_gather_u64index_f64(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 1 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv1i1(<vscale x 16 x i1> [[PG:%.*]])
+// CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 2 x double> @llvm.aarch64.sve.ld1q.gather.index.nxv2f64(<vscale x 1 x i1> [[TMP0]], ptr [[BASE:%.*]], <vscale x 2 x i64> [[IDX:%.*]])
+// CHECK-NEXT: ret <vscale x 2 x double> [[TMP1]]
+//
+// CPP-CHECK-LABEL: @_Z31test_svld1q_gather_u64index_f64u10__SVBool_tPKdu12__SVUint64_t(
+// CPP-CHECK-NEXT: entry:
+// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 1 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv1i1(<vscale x 16 x i1> [[PG:%.*]])
+// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 2 x double> @llvm.aarch64.sve.ld1q.gather.index.nxv2f64(<vscale x 1 x i1> [[TMP0]], ptr [[BASE:%.*]], <vscale x 2 x i64> [[IDX:%.*]])
+// CPP-CHECK-NEXT: ret <vscale x 2 x double> [[TMP1]]
+//
+svfloat64_t test_svld1q_gather_u64index_f64(svbool_t pg, const float64_t *base, svuint64_t idx) {
+ return SVE_ACLE_FUNC(svld1q_gather_,u64,index,_f64) (pg, base, idx);
+}
+
+// CHECK-LABEL: @test_svld1q_gather_u64base_index_s16(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 1 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv1i1(<vscale x 16 x i1> [[PG:%.*]])
+// CHECK-NEXT: [[TMP1:%.*]] = shl i64 [[IDX:%.*]], 1
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <vscale x 8 x i16> @llvm.aarch64.sve.ld1q.gather.scalar.offset.nxv8i16.nxv2i64(<vscale x 1 x i1> [[TMP0]], <vscale x 2 x i64> [[BASE:%.*]], i64 [[TMP1]])
+// CHECK-NEXT: ret <vscale x 8 x i16> [[TMP2]]
+//
+// CPP-CHECK-LABEL: @_Z36test_svld1q_gather_u64base_index_s16u10__SVBool_tu12__SVUint64_tl(
+// CPP-CHECK-NEXT: entry:
+// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 1 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv1i1(<vscale x 16 x i1> [[PG:%.*]])
+// CPP-CHECK-NEXT: [[TMP1:%.*]] = shl i64 [[IDX:%.*]], 1
+// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call <vscale x 8 x i16> @llvm.aarch64.sve.ld1q.gather.scalar.offset.nxv8i16.nxv2i64(<vscale x 1 x i1> [[TMP0]], <vscale x 2 x i64> [[BASE:%.*]], i64 [[TMP1]])
+// CPP-CHECK-NEXT: ret <vscale x 8 x i16> [[TMP2]]
+//
+svint16_t test_svld1q_gather_u64base_index_s16(svbool_t pg, svuint64_t base, int64_t idx) {
+ return SVE_ACLE_FUNC(svld1q_gather,_u64base,_index_s16,)(pg, base, idx);
+}
+
+// CHECK-LABEL: @test_svld1q_gather_u64base_index_u16(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 1 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv1i1(<vscale x 16 x i1> [[PG:%.*]])
+// CHECK-NEXT: [[TMP1:%.*]] = shl i64 [[IDX:%.*]], 1
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <vscale x 8 x i16> @llvm.aarch64.sve.ld1q.gather.scalar.offset.nxv8i16.nxv2i64(<vscale x 1 x i1> [[TMP0]], <vscale x 2 x i64> [[BASE:%.*]], i64 [[TMP1]])
+// CHECK-NEXT: ret <vscale x 8 x i16> [[TMP2]]
+//
+// CPP-CHECK-LABEL: @_Z36test_svld1q_gather_u64base_index_u16u10__SVBool_tu12__SVUint64_tl(
+// CPP-CHECK-NEXT: entry:
+// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 1 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv1i1(<vscale x 16 x i1> [[PG:%.*]])
+// CPP-CHECK-NEXT: [[TMP1:%.*]] = shl i64 [[IDX:%.*]], 1
+// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call <vscale x 8 x i16> @llvm.aarch64.sve.ld1q.gather.scalar.offset.nxv8i16.nxv2i64(<vscale x 1 x i1> [[TMP0]], <vscale x 2 x i64> [[BASE:%.*]], i64 [[TMP1]])
+// CPP-CHECK-NEXT: ret <vscale x 8 x i16> [[TMP2]]
+//
+svuint16_t test_svld1q_gather_u64base_index_u16(svbool_t pg, svuint64_t base, int64_t idx) {
+ return SVE_ACLE_FUNC(svld1q_gather,_u64base,_index_u16,)(pg, base, idx);
+}
+
+// CHECK-LABEL: @test_svld1q_gather_u64base_index_s32(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 1 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv1i1(<vscale x 16 x i1> [[PG:%.*]])
+// CHECK-NEXT: [[TMP1:%.*]] = shl i64 [[IDX:%.*]], 2
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <vscale x 4 x i32> @llvm.aarch64.sve.ld1q.gather.scalar.offset.nxv4i32.nxv2i64(<vscale x 1 x i1> [[TMP0]], <vscale x 2 x i64> [[BASE:%.*]], i64 [[TMP1]])
+// CHECK-NEXT: ret <vscale x 4 x i32> [[TMP2]]
+//
+// CPP-CHECK-LABEL: @_Z36test_svld1q_gather_u64base_index_s32u10__SVBool_tu12__SVUint64_tl(
+// CPP-CHECK-NEXT: entry:
+// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 1 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv1i1(<vscale x 16 x i1> [[PG:%.*]])
+// CPP-CHECK-NEXT: [[TMP1:%.*]] = shl i64 [[IDX:%.*]], 2
+// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call <vscale x 4 x i32> @llvm.aarch64.sve.ld1q.gather.scalar.offset.nxv4i32.nxv2i64(<vscale x 1 x i1> [[TMP0]], <vscale x 2 x i64> [[BASE:%.*]], i64 [[TMP1]])
+// CPP-CHECK-NEXT: ret <vscale x 4 x i32> [[TMP2]]
+//
+svint32_t test_svld1q_gather_u64base_index_s32(svbool_t pg, svuint64_t base, int64_t idx) {
+ return SVE_ACLE_FUNC(svld1q_gather,_u64base,_index_s32,)(pg, base, idx);
+}
+
+// CHECK-LABEL: @test_svld1q_gather_u64base_index_u32(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 1 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv1i1(<vscale x 16 x i1> [[PG:%.*]])
+// CHECK-NEXT: [[TMP1:%.*]] = shl i64 [[IDX:%.*]], 2
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <vscale x 4 x i32> @llvm.aarch64.sve.ld1q.gather.scalar.offset.nxv4i32.nxv2i64(<vscale x 1 x i1> [[TMP0]], <vscale x 2 x i64> [[BASE:%.*]], i64 [[TMP1]])
+// CHECK-NEXT: ret <vscale x 4 x i32> [[TMP2]]
+//
+// CPP-CHECK-LABEL: @_Z36test_svld1q_gather_u64base_index_u32u10__SVBool_tu12__SVUint64_tl(
+// CPP-CHECK-NEXT: entry:
+// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 1 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv1i1(<vscale x 16 x i1> [[PG:%.*]])
+// CPP-CHECK-NEXT: [[TMP1:%.*]] = shl i64 [[IDX:%.*]], 2
+// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call <vscale x 4 x i32> @llvm.aarch64.sve.ld1q.gather.scalar.offset.nxv4i32.nxv2i64(<vscale x 1 x i1> [[TMP0]], <vscale x 2 x i64> [[BASE:%.*]], i64 [[TMP1]])
+// CPP-CHECK-NEXT: ret <vscale x 4 x i32> [[TMP2]]
+//
+svuint32_t test_svld1q_gather_u64base_index_u32(svbool_t pg, svuint64_t base, int64_t idx) {
+ return SVE_ACLE_FUNC(svld1q_gather,_u64base,_index_u32,)(pg, base, idx);
+}
+
+// CHECK-LABEL: @test_svld1q_gather_u64base_index_s64(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 1 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv1i1(<vscale x 16 x i1> [[PG:%.*]])
+// CHECK-NEXT: [[TMP1:%.*]] = shl i64 [[IDX:%.*]], 3
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <vscale x 2 x i64> @llvm.aarch64.sve.ld1q.gather.scalar.offset.nxv2i64.nxv2i64(<vscale x 1 x i1> [[TMP0]], <vscale x 2 x i64> [[BASE:%.*]], i64 [[TMP1]])
+// CHECK-NEXT: ret <vscale x 2 x i64> [[TMP2]]
+//
+// CPP-CHECK-LABEL: @_Z36test_svld1q_gather_u64base_index_s64u10__SVBool_tu12__SVUint64_tl(
+// CPP-CHECK-NEXT: entry:
+// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 1 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv1i1(<vscale x 16 x i1> [[PG:%.*]])
+// CPP-CHECK-NEXT: [[TMP1:%.*]] = shl i64 [[IDX:%.*]], 3
+// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call <vscale x 2 x i64> @llvm.aarch64.sve.ld1q.gather.scalar.offset.nxv2i64.nxv2i64(<vscale x 1 x i1> [[TMP0]], <vscale x 2 x i64> [[BASE:%.*]], i64 [[TMP1]])
+// CPP-CHECK-NEXT: ret <vscale x 2 x i64> [[TMP2]]
+//
+svint64_t test_svld1q_gather_u64base_index_s64(svbool_t pg, svuint64_t base, int64_t idx) {
+ return SVE_ACLE_FUNC(svld1q_gather,_u64base,_index_s64,)(pg, base, idx);
+}
+
+// CHECK-LABEL: @test_svld1q_gather_u64base_index_u64(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 1 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv1i1(<vscale x 16 x i1> [[PG:%.*]])
+// CHECK-NEXT: [[TMP1:%.*]] = shl i64 [[IDX:%.*]], 3
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <vscale x 2 x i64> @llvm.aarch64.sve.ld1q.gather.scalar.offset.nxv2i64.nxv2i64(<vscale x 1 x i1> [[TMP0]], <vscale x 2 x i64> [[BASE:%.*]], i64 [[TMP1]])
+// CHECK-NEXT: ret <vscale x 2 x i64> [[TMP2]]
+//
+// CPP-CHECK-LABEL: @_Z36test_svld1q_gather_u64base_index_u64u10__SVBool_tu12__SVUint64_tl(
+// CPP-CHECK-NEXT: entry:
+// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 1 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv1i1(<vscale x 16 x i1> [[PG:%.*]])
+// CPP-CHECK-NEXT: [[TMP1:%.*]] = shl i64 [[IDX:%.*]], 3
+// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call <vscale x 2 x i64> @llvm.aarch64.sve.ld1q.gather.scalar.offset.nxv2i64.nxv2i64(<vscale x 1 x i1> [[TMP0]], <vscale x 2 x i64> [[BASE:%.*]], i64 [[TMP1]])
+// CPP-CHECK-NEXT: ret <vscale x 2 x i64> [[TMP2]]
+//
+svuint64_t test_svld1q_gather_u64base_index_u64(svbool_t pg, svuint64_t base, int64_t idx) {
+ return SVE_ACLE_FUNC(svld1q_gather,_u64base,_index_u64,)(pg, base, idx);
+}
+
+// CHECK-LABEL: @test_svld1q_gather_u64base_index_bf16(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 1 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv1i1(<vscale x 16 x i1> [[PG:%.*]])
+// CHECK-NEXT: [[TMP1:%.*]] = shl i64 [[IDX:%.*]], 1
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <vscale x 8 x bfloat> @llvm.aarch64.sve.ld1q.gather.scalar.offset.nxv8bf16.nxv2i64(<vscale x 1 x i1> [[TMP0]], <vscale x 2 x i64> [[BASE:%.*]], i64 [[TMP1]])
+// CHECK-NEXT: ret <vscale x 8 x bfloat> [[TMP2]]
+//
+// CPP-CHECK-LABEL: @_Z37test_svld1q_gather_u64base_index_bf16u10__SVBool_tu12__SVUint64_tl(
+// CPP-CHECK-NEXT: entry:
+// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 1 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv1i1(<vscale x 16 x i1> [[PG:%.*]])
+// CPP-CHECK-NEXT: [[TMP1:%.*]] = shl i64 [[IDX:%.*]], 1
+// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call <vscale x 8 x bfloat> @llvm.aarch64.sve.ld1q.gather.scalar.offset.nxv8bf16.nxv2i64(<vscale x 1 x i1> [[TMP0]], <vscale x 2 x i64> [[BASE:%.*]], i64 [[TMP1]])
+// CPP-CHECK-NEXT: ret <vscale x 8 x bfloat> [[TMP2]]
+//
+svbfloat16_t test_svld1q_gather_u64base_index_bf16(svbool_t pg, svuint64_t base, int64_t idx) {
+ return SVE_ACLE_FUNC(svld1q_gather,_u64base,_index_bf16,)(pg, base, idx);
+}
+
+// CHECK-LABEL: @test_svld1q_gather_u64base_index_f16(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 1 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv1i1(<vscale x 16 x i1> [[PG:%.*]])
+// CHECK-NEXT: [[TMP1:%.*]] = shl i64 [[IDX:%.*]], 1
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <vscale x 8 x half> @llvm.aarch64.sve.ld1q.gather.scalar.offset.nxv8f16.nxv2i64(<vscale x 1 x i1> [[TMP0]], <vscale x 2 x i64> [[BASE:%.*]], i64 [[TMP1]])
+// CHECK-NEXT: ret <vscale x 8 x half> [[TMP2]]
+//
+// CPP-CHECK-LABEL: @_Z36test_svld1q_gather_u64base_index_f16u10__SVBool_tu12__SVUint64_tl(
+// CPP-CHECK-NEXT: entry:
+// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 1 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv1i1(<vscale x 16 x i1> [[PG:%.*]])
+// CPP-CHECK-NEXT: [[TMP1:%.*]] = shl i64 [[IDX:%.*]], 1
+// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call <vscale x 8 x half> @llvm.aarch64.sve.ld1q.gather.scalar.offset.nxv8f16.nxv2i64(<vscale x 1 x i1> [[TMP0]], <vscale x 2 x i64> [[BASE:%.*]], i64 [[TMP1]])
+// CPP-CHECK-NEXT: ret <vscale x 8 x half> [[TMP2]]
+//
+svfloat16_t test_svld1q_gather_u64base_index_f16(svbool_t pg, svuint64_t base, int64_t idx) {
+ return SVE_ACLE_FUNC(svld1q_gather,_u64base,_index_f16,)(pg, base, idx);
+}
+
+// CHECK-LABEL: @test_svld1q_gather_u64base_index_f32(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 1 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv1i1(<vscale x 16 x i1> [[PG:%.*]])
+// CHECK-NEXT: [[TMP1:%.*]] = shl i64 [[IDX:%.*]], 2
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <vscale x 4 x float> @llvm.aarch64.sve.ld1q.gather.scalar.offset.nxv4f32.nxv2i64(<vscale x 1 x i1> [[TMP0]], <vscale x 2 x i64> [[BASE:%.*]], i64 [[TMP1]])
+// CHECK-NEXT: ret <vscale x 4 x float> [[TMP2]]
+//
+// CPP-CHECK-LABEL: @_Z36test_svld1q_gather_u64base_index_f32u10__SVBool_tu12__SVUint64_tl(
+// CPP-CHECK-NEXT: entry:
+// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 1 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv1i1(<vscale x 16 x i1> [[PG:%.*]])
+// CPP-CHECK-NEXT: [[TMP1:%.*]] = shl i64 [[IDX:%.*]], 2
+// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call <vscale x 4 x float> @llvm.aarch64.sve.ld1q.gather.scalar.offset.nxv4f32.nxv2i64(<vscale x 1 x i1> [[TMP0]], <vscale x 2 x i64> [[BASE:%.*]], i64 [[TMP1]])
+// CPP-CHECK-NEXT: ret <vscale x 4 x float> [[TMP2]]
+//
+svfloat32_t test_svld1q_gather_u64base_index_f32(svbool_t pg, svuint64_t base, int64_t idx) {
+ return SVE_ACLE_FUNC(svld1q_gather,_u64base,_index_f32,)(pg, base, idx);
+}
+
+// CHECK-LABEL: @test_svld1q_gather_u64base_index_f64(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 1 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv1i1(<vscale x 16 x i1> [[PG:%.*]])
+// CHECK-NEXT: [[TMP1:%.*]] = shl i64 [[IDX:%.*]], 3
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <vscale x 2 x double> @llvm.aarch64.sve.ld1q.gather.scalar.offset.nxv2f64.nxv2i64(<vscale x 1 x i1> [[TMP0]], <vscale x 2 x i64> [[BASE:%.*]], i64 [[TMP1]])
+// CHECK-NEXT: ret <vscale x 2 x double> [[TMP2]]
+//
+// CPP-CHECK-LABEL: @_Z36test_svld1q_gather_u64base_index_f64u10__SVBool_tu12__SVUint64_tl(
+// CPP-CHECK-NEXT: entry:
+// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 1 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv1i1(<vscale x 16 x i1> [[PG:%.*]])
+// CPP-CHECK-NEXT: [[TMP1:%.*]] = shl i64 [[IDX:%.*]], 3
+// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call <vscale x 2 x double> @llvm.aarch64.sve.ld1q.gather.scalar.offset.nxv2f64.nxv2i64(<vscale x 1 x i1> [[TMP0]], <vscale x 2 x i64> [[BASE:%.*]], i64 [[TMP1]])
+// CPP-CHECK-NEXT: ret <vscale x 2 x double> [[TMP2]]
+//
+svfloat64_t test_svld1q_gather_u64base_index_f64(svbool_t pg, svuint64_t base, int64_t idx) {
+ return SVE_ACLE_FUNC(svld1q_gather,_u64base,_index_f64,)(pg, base, idx);
+}
diff --git a/clang/test/CodeGen/aarch64-sve2p1-intrinsics/acle_sve2p1_store.c b/clang/test/CodeGen/aarch64-sve2p1-intrinsics/acle_sve2p1_store.c
index 1fb5933ce75e1e3..137801cc0814a10 100644
--- a/clang/test/CodeGen/aarch64-sve2p1-intrinsics/acle_sve2p1_store.c
+++ b/clang/test/CodeGen/aarch64-sve2p1-intrinsics/acle_sve2p1_store.c
@@ -2130,3 +2130,343 @@ void test_svst1q_scatter_u64base_bf16(svbool_t pg, svuint64_t base, svbfloat16_t
{
SVE_ACLE_FUNC(svst1q_scatter, _u64base,,_bf16)(pg, base, data);
}
+
+// CHECK-LABEL: @test_svst1q_scatter_u64index_s16(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 1 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv1i1(<vscale x 16 x i1> [[PG:%.*]])
+// CHECK-NEXT: tail call void @llvm.aarch64.sve.st1q.scatter.index.nxv8i16(<vscale x 8 x i16> [[DATA:%.*]], <vscale x 1 x i1> [[TMP0]], ptr [[BASE:%.*]], <vscale x 2 x i64> [[IDX:%.*]])
+// CHECK-NEXT: ret void
+//
+// CPP-CHECK-LABEL: @_Z32test_svst1q_scatter_u64index_s16u10__SVBool_tPsu12__SVUint64_tu11__SVInt16_t(
+// CPP-CHECK-NEXT: entry:
+// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 1 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv1i1(<vscale x 16 x i1> [[PG:%.*]])
+// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sve.st1q.scatter.index.nxv8i16(<vscale x 8 x i16> [[DATA:%.*]], <vscale x 1 x i1> [[TMP0]], ptr [[BASE:%.*]], <vscale x 2 x i64> [[IDX:%.*]])
+// CPP-CHECK-NEXT: ret void
+//
+void test_svst1q_scatter_u64index_s16(svbool_t pg, int16_t *base, svuint64_t idx, svint16_t data) {
+ SVE_ACLE_FUNC(svst1q_scatter_, u64, index, _s16)(pg, base, idx, data);
+}
+
+// CHECK-LABEL: @test_svst1q_scatter_u64index_u16(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 1 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv1i1(<vscale x 16 x i1> [[PG:%.*]])
+// CHECK-NEXT: tail call void @llvm.aarch64.sve.st1q.scatter.index.nxv8i16(<vscale x 8 x i16> [[DATA:%.*]], <vscale x 1 x i1> [[TMP0]], ptr [[BASE:%.*]], <vscale x 2 x i64> [[IDX:%.*]])
+// CHECK-NEXT: ret void
+//
+// CPP-CHECK-LABEL: @_Z32test_svst1q_scatter_u64index_u16u10__SVBool_tPtu12__SVUint64_tu12__SVUint16_t(
+// CPP-CHECK-NEXT: entry:
+// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 1 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv1i1(<vscale x 16 x i1> [[PG:%.*]])
+// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sve.st1q.scatter.index.nxv8i16(<vscale x 8 x i16> [[DATA:%.*]], <vscale x 1 x i1> [[TMP0]], ptr [[BASE:%.*]], <vscale x 2 x i64> [[IDX:%.*]])
+// CPP-CHECK-NEXT: ret void
+//
+void test_svst1q_scatter_u64index_u16(svbool_t pg, uint16_t *base, svuint64_t idx, svuint16_t data) {
+ SVE_ACLE_FUNC(svst1q_scatter_, u64, index, _u16)(pg, base, idx, data);
+}
+
+// CHECK-LABEL: @test_svst1q_scatter_u64index_s32(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 1 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv1i1(<vscale x 16 x i1> [[PG:%.*]])
+// CHECK-NEXT: tail call void @llvm.aarch64.sve.st1q.scatter.index.nxv4i32(<vscale x 4 x i32> [[DATA:%.*]], <vscale x 1 x i1> [[TMP0]], ptr [[BASE:%.*]], <vscale x 2 x i64> [[IDX:%.*]])
+// CHECK-NEXT: ret void
+//
+// CPP-CHECK-LABEL: @_Z32test_svst1q_scatter_u64index_s32u10__SVBool_tPiu12__SVUint64_tu11__SVInt32_t(
+// CPP-CHECK-NEXT: entry:
+// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 1 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv1i1(<vscale x 16 x i1> [[PG:%.*]])
+// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sve.st1q.scatter.index.nxv4i32(<vscale x 4 x i32> [[DATA:%.*]], <vscale x 1 x i1> [[TMP0]], ptr [[BASE:%.*]], <vscale x 2 x i64> [[IDX:%.*]])
+// CPP-CHECK-NEXT: ret void
+//
+void test_svst1q_scatter_u64index_s32(svbool_t pg, int32_t *base, svuint64_t idx, svint32_t data) {
+ SVE_ACLE_FUNC(svst1q_scatter_, u64, index, _s32)(pg, base, idx, data);
+}
+
+// CHECK-LABEL: @test_svst1q_scatter_u64index_u32(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 1 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv1i1(<vscale x 16 x i1> [[PG:%.*]])
+// CHECK-NEXT: tail call void @llvm.aarch64.sve.st1q.scatter.index.nxv4i32(<vscale x 4 x i32> [[DATA:%.*]], <vscale x 1 x i1> [[TMP0]], ptr [[BASE:%.*]], <vscale x 2 x i64> [[IDX:%.*]])
+// CHECK-NEXT: ret void
+//
+// CPP-CHECK-LABEL: @_Z32test_svst1q_scatter_u64index_u32u10__SVBool_tPju12__SVUint64_tu12__SVUint32_t(
+// CPP-CHECK-NEXT: entry:
+// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 1 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv1i1(<vscale x 16 x i1> [[PG:%.*]])
+// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sve.st1q.scatter.index.nxv4i32(<vscale x 4 x i32> [[DATA:%.*]], <vscale x 1 x i1> [[TMP0]], ptr [[BASE:%.*]], <vscale x 2 x i64> [[IDX:%.*]])
+// CPP-CHECK-NEXT: ret void
+//
+void test_svst1q_scatter_u64index_u32(svbool_t pg, uint32_t *base, svuint64_t idx, svuint32_t data) {
+ SVE_ACLE_FUNC(svst1q_scatter_, u64, index, _u32)(pg, base, idx, data);
+}
+
+// CHECK-LABEL: @test_svst1q_scatter_u64index_s64(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 1 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv1i1(<vscale x 16 x i1> [[PG:%.*]])
+// CHECK-NEXT: tail call void @llvm.aarch64.sve.st1q.scatter.index.nxv2i64(<vscale x 2 x i64> [[DATA:%.*]], <vscale x 1 x i1> [[TMP0]], ptr [[BASE:%.*]], <vscale x 2 x i64> [[IDX:%.*]])
+// CHECK-NEXT: ret void
+//
+// CPP-CHECK-LABEL: @_Z32test_svst1q_scatter_u64index_s64u10__SVBool_tPlu12__SVUint64_tu11__SVInt64_t(
+// CPP-CHECK-NEXT: entry:
+// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 1 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv1i1(<vscale x 16 x i1> [[PG:%.*]])
+// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sve.st1q.scatter.index.nxv2i64(<vscale x 2 x i64> [[DATA:%.*]], <vscale x 1 x i1> [[TMP0]], ptr [[BASE:%.*]], <vscale x 2 x i64> [[IDX:%.*]])
+// CPP-CHECK-NEXT: ret void
+//
+void test_svst1q_scatter_u64index_s64(svbool_t pg, int64_t *base, svuint64_t idx, svint64_t data) {
+ SVE_ACLE_FUNC(svst1q_scatter_, u64, index, _s64)(pg, base, idx, data);
+}
+
+// CHECK-LABEL: @test_svst1q_scatter_u64index_u64(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 1 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv1i1(<vscale x 16 x i1> [[PG:%.*]])
+// CHECK-NEXT: tail call void @llvm.aarch64.sve.st1q.scatter.index.nxv2i64(<vscale x 2 x i64> [[DATA:%.*]], <vscale x 1 x i1> [[TMP0]], ptr [[BASE:%.*]], <vscale x 2 x i64> [[IDX:%.*]])
+// CHECK-NEXT: ret void
+//
+// CPP-CHECK-LABEL: @_Z32test_svst1q_scatter_u64index_u64u10__SVBool_tPmu12__SVUint64_tS1_(
+// CPP-CHECK-NEXT: entry:
+// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 1 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv1i1(<vscale x 16 x i1> [[PG:%.*]])
+// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sve.st1q.scatter.index.nxv2i64(<vscale x 2 x i64> [[DATA:%.*]], <vscale x 1 x i1> [[TMP0]], ptr [[BASE:%.*]], <vscale x 2 x i64> [[IDX:%.*]])
+// CPP-CHECK-NEXT: ret void
+//
+void test_svst1q_scatter_u64index_u64(svbool_t pg, uint64_t *base, svuint64_t idx, svuint64_t data) {
+ SVE_ACLE_FUNC(svst1q_scatter_, u64, index, _u64)(pg, base, idx, data);
+}
+
+// CHECK-LABEL: @test_svst1q_scatter_u64index_bf16(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 1 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv1i1(<vscale x 16 x i1> [[PG:%.*]])
+// CHECK-NEXT: tail call void @llvm.aarch64.sve.st1q.scatter.index.nxv8bf16(<vscale x 8 x bfloat> [[DATA:%.*]], <vscale x 1 x i1> [[TMP0]], ptr [[BASE:%.*]], <vscale x 2 x i64> [[IDX:%.*]])
+// CHECK-NEXT: ret void
+//
+// CPP-CHECK-LABEL: @_Z33test_svst1q_scatter_u64index_bf16u10__SVBool_tPu6__bf16u12__SVUint64_tu14__SVBfloat16_t(
+// CPP-CHECK-NEXT: entry:
+// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 1 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv1i1(<vscale x 16 x i1> [[PG:%.*]])
+// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sve.st1q.scatter.index.nxv8bf16(<vscale x 8 x bfloat> [[DATA:%.*]], <vscale x 1 x i1> [[TMP0]], ptr [[BASE:%.*]], <vscale x 2 x i64> [[IDX:%.*]])
+// CPP-CHECK-NEXT: ret void
+//
+void test_svst1q_scatter_u64index_bf16(svbool_t pg, bfloat16_t *base, svuint64_t idx, svbfloat16_t data) {
+ SVE_ACLE_FUNC(svst1q_scatter_, u64, index, _bf16)(pg, base, idx, data);
+}
+
+// CHECK-LABEL: @test_svst1q_scatter_u64index_f16(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 1 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv1i1(<vscale x 16 x i1> [[PG:%.*]])
+// CHECK-NEXT: tail call void @llvm.aarch64.sve.st1q.scatter.index.nxv8f16(<vscale x 8 x half> [[DATA:%.*]], <vscale x 1 x i1> [[TMP0]], ptr [[BASE:%.*]], <vscale x 2 x i64> [[IDX:%.*]])
+// CHECK-NEXT: ret void
+//
+// CPP-CHECK-LABEL: @_Z32test_svst1q_scatter_u64index_f16u10__SVBool_tPDhu12__SVUint64_tu13__SVFloat16_t(
+// CPP-CHECK-NEXT: entry:
+// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 1 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv1i1(<vscale x 16 x i1> [[PG:%.*]])
+// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sve.st1q.scatter.index.nxv8f16(<vscale x 8 x half> [[DATA:%.*]], <vscale x 1 x i1> [[TMP0]], ptr [[BASE:%.*]], <vscale x 2 x i64> [[IDX:%.*]])
+// CPP-CHECK-NEXT: ret void
+//
+void test_svst1q_scatter_u64index_f16(svbool_t pg, float16_t *base, svuint64_t idx, svfloat16_t data) {
+ SVE_ACLE_FUNC(svst1q_scatter_, u64, index, _f16)(pg, base, idx, data);
+}
+
+// CHECK-LABEL: @test_svst1q_scatter_u64index_f32(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 1 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv1i1(<vscale x 16 x i1> [[PG:%.*]])
+// CHECK-NEXT: tail call void @llvm.aarch64.sve.st1q.scatter.index.nxv4f32(<vscale x 4 x float> [[DATA:%.*]], <vscale x 1 x i1> [[TMP0]], ptr [[BASE:%.*]], <vscale x 2 x i64> [[IDX:%.*]])
+// CHECK-NEXT: ret void
+//
+// CPP-CHECK-LABEL: @_Z32test_svst1q_scatter_u64index_f32u10__SVBool_tPfu12__SVUint64_tu13__SVFloat32_t(
+// CPP-CHECK-NEXT: entry:
+// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 1 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv1i1(<vscale x 16 x i1> [[PG:%.*]])
+// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sve.st1q.scatter.index.nxv4f32(<vscale x 4 x float> [[DATA:%.*]], <vscale x 1 x i1> [[TMP0]], ptr [[BASE:%.*]], <vscale x 2 x i64> [[IDX:%.*]])
+// CPP-CHECK-NEXT: ret void
+//
+void test_svst1q_scatter_u64index_f32(svbool_t pg, float32_t *base, svuint64_t idx, svfloat32_t data) {
+ SVE_ACLE_FUNC(svst1q_scatter_, u64, index, _f32)(pg, base, idx, data);
+}
+
+// CHECK-LABEL: @test_svst1q_scatter_u64index_f64(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 1 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv1i1(<vscale x 16 x i1> [[PG:%.*]])
+// CHECK-NEXT: tail call void @llvm.aarch64.sve.st1q.scatter.index.nxv2f64(<vscale x 2 x double> [[DATA:%.*]], <vscale x 1 x i1> [[TMP0]], ptr [[BASE:%.*]], <vscale x 2 x i64> [[IDX:%.*]])
+// CHECK-NEXT: ret void
+//
+// CPP-CHECK-LABEL: @_Z32test_svst1q_scatter_u64index_f64u10__SVBool_tPdu12__SVUint64_tu13__SVFloat64_t(
+// CPP-CHECK-NEXT: entry:
+// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 1 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv1i1(<vscale x 16 x i1> [[PG:%.*]])
+// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sve.st1q.scatter.index.nxv2f64(<vscale x 2 x double> [[DATA:%.*]], <vscale x 1 x i1> [[TMP0]], ptr [[BASE:%.*]], <vscale x 2 x i64> [[IDX:%.*]])
+// CPP-CHECK-NEXT: ret void
+//
+void test_svst1q_scatter_u64index_f64(svbool_t pg, float64_t *base, svuint64_t idx, svfloat64_t data) {
+ SVE_ACLE_FUNC(svst1q_scatter_, u64, index, _f64)(pg, base, idx, data);
+}
+
+// CHECK-LABEL: @test_svst1q_scatter_u64base_index_s16(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 1 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv1i1(<vscale x 16 x i1> [[PG:%.*]])
+// CHECK-NEXT: [[TMP1:%.*]] = shl i64 [[IDX:%.*]], 1
+// CHECK-NEXT: tail call void @llvm.aarch64.sve.st1q.scatter.scalar.offset.nxv8i16.nxv2i64(<vscale x 8 x i16> [[DATA:%.*]], <vscale x 1 x i1> [[TMP0]], <vscale x 2 x i64> [[BASE:%.*]], i64 [[TMP1]])
+// CHECK-NEXT: ret void
+//
+// CPP-CHECK-LABEL: @_Z37test_svst1q_scatter_u64base_index_s16u10__SVBool_tu12__SVUint64_tlu11__SVInt16_t(
+// CPP-CHECK-NEXT: entry:
+// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 1 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv1i1(<vscale x 16 x i1> [[PG:%.*]])
+// CPP-CHECK-NEXT: [[TMP1:%.*]] = shl i64 [[IDX:%.*]], 1
+// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sve.st1q.scatter.scalar.offset.nxv8i16.nxv2i64(<vscale x 8 x i16> [[DATA:%.*]], <vscale x 1 x i1> [[TMP0]], <vscale x 2 x i64> [[BASE:%.*]], i64 [[TMP1]])
+// CPP-CHECK-NEXT: ret void
+//
+void test_svst1q_scatter_u64base_index_s16(svbool_t pg, svuint64_t base, int64_t idx, svint16_t data) {
+ SVE_ACLE_FUNC(svst1q_scatter,_u64base,_index,_s16)(pg, base, idx, data);
+}
+
+// CHECK-LABEL: @test_svst1q_scatter_u64base_index_u16(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 1 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv1i1(<vscale x 16 x i1> [[PG:%.*]])
+// CHECK-NEXT: [[TMP1:%.*]] = shl i64 [[IDX:%.*]], 1
+// CHECK-NEXT: tail call void @llvm.aarch64.sve.st1q.scatter.scalar.offset.nxv8i16.nxv2i64(<vscale x 8 x i16> [[DATA:%.*]], <vscale x 1 x i1> [[TMP0]], <vscale x 2 x i64> [[BASE:%.*]], i64 [[TMP1]])
+// CHECK-NEXT: ret void
+//
+// CPP-CHECK-LABEL: @_Z37test_svst1q_scatter_u64base_index_u16u10__SVBool_tu12__SVUint64_tlu12__SVUint16_t(
+// CPP-CHECK-NEXT: entry:
+// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 1 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv1i1(<vscale x 16 x i1> [[PG:%.*]])
+// CPP-CHECK-NEXT: [[TMP1:%.*]] = shl i64 [[IDX:%.*]], 1
+// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sve.st1q.scatter.scalar.offset.nxv8i16.nxv2i64(<vscale x 8 x i16> [[DATA:%.*]], <vscale x 1 x i1> [[TMP0]], <vscale x 2 x i64> [[BASE:%.*]], i64 [[TMP1]])
+// CPP-CHECK-NEXT: ret void
+//
+void test_svst1q_scatter_u64base_index_u16(svbool_t pg, svuint64_t base, int64_t idx, svuint16_t data) {
+ SVE_ACLE_FUNC(svst1q_scatter,_u64base,_index,_u16)(pg, base, idx, data);
+}
+
+// CHECK-LABEL: @test_svst1q_scatter_u64base_index_s32(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 1 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv1i1(<vscale x 16 x i1> [[PG:%.*]])
+// CHECK-NEXT: [[TMP1:%.*]] = shl i64 [[IDX:%.*]], 2
+// CHECK-NEXT: tail call void @llvm.aarch64.sve.st1q.scatter.scalar.offset.nxv4i32.nxv2i64(<vscale x 4 x i32> [[DATA:%.*]], <vscale x 1 x i1> [[TMP0]], <vscale x 2 x i64> [[BASE:%.*]], i64 [[TMP1]])
+// CHECK-NEXT: ret void
+//
+// CPP-CHECK-LABEL: @_Z37test_svst1q_scatter_u64base_index_s32u10__SVBool_tu12__SVUint64_tlu11__SVInt32_t(
+// CPP-CHECK-NEXT: entry:
+// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 1 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv1i1(<vscale x 16 x i1> [[PG:%.*]])
+// CPP-CHECK-NEXT: [[TMP1:%.*]] = shl i64 [[IDX:%.*]], 2
+// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sve.st1q.scatter.scalar.offset.nxv4i32.nxv2i64(<vscale x 4 x i32> [[DATA:%.*]], <vscale x 1 x i1> [[TMP0]], <vscale x 2 x i64> [[BASE:%.*]], i64 [[TMP1]])
+// CPP-CHECK-NEXT: ret void
+//
+void test_svst1q_scatter_u64base_index_s32(svbool_t pg, svuint64_t base, int64_t idx, svint32_t data) {
+ SVE_ACLE_FUNC(svst1q_scatter,_u64base,_index,_s32)(pg, base, idx, data);
+}
+
+// CHECK-LABEL: @test_svst1q_scatter_u64base_index_u32(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 1 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv1i1(<vscale x 16 x i1> [[PG:%.*]])
+// CHECK-NEXT: [[TMP1:%.*]] = shl i64 [[IDX:%.*]], 2
+// CHECK-NEXT: tail call void @llvm.aarch64.sve.st1q.scatter.scalar.offset.nxv4i32.nxv2i64(<vscale x 4 x i32> [[DATA:%.*]], <vscale x 1 x i1> [[TMP0]], <vscale x 2 x i64> [[BASE:%.*]], i64 [[TMP1]])
+// CHECK-NEXT: ret void
+//
+// CPP-CHECK-LABEL: @_Z37test_svst1q_scatter_u64base_index_u32u10__SVBool_tu12__SVUint64_tlu12__SVUint32_t(
+// CPP-CHECK-NEXT: entry:
+// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 1 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv1i1(<vscale x 16 x i1> [[PG:%.*]])
+// CPP-CHECK-NEXT: [[TMP1:%.*]] = shl i64 [[IDX:%.*]], 2
+// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sve.st1q.scatter.scalar.offset.nxv4i32.nxv2i64(<vscale x 4 x i32> [[DATA:%.*]], <vscale x 1 x i1> [[TMP0]], <vscale x 2 x i64> [[BASE:%.*]], i64 [[TMP1]])
+// CPP-CHECK-NEXT: ret void
+//
+void test_svst1q_scatter_u64base_index_u32(svbool_t pg, svuint64_t base, int64_t idx, svuint32_t data) {
+ SVE_ACLE_FUNC(svst1q_scatter,_u64base,_index,_u32)(pg, base, idx, data);
+}
+
+// CHECK-LABEL: @test_svst1q_scatter_u64base_index_s64(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 1 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv1i1(<vscale x 16 x i1> [[PG:%.*]])
+// CHECK-NEXT: [[TMP1:%.*]] = shl i64 [[IDX:%.*]], 3
+// CHECK-NEXT: tail call void @llvm.aarch64.sve.st1q.scatter.scalar.offset.nxv2i64.nxv2i64(<vscale x 2 x i64> [[DATA:%.*]], <vscale x 1 x i1> [[TMP0]], <vscale x 2 x i64> [[BASE:%.*]], i64 [[TMP1]])
+// CHECK-NEXT: ret void
+//
+// CPP-CHECK-LABEL: @_Z37test_svst1q_scatter_u64base_index_s64u10__SVBool_tu12__SVUint64_tlu11__SVInt64_t(
+// CPP-CHECK-NEXT: entry:
+// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 1 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv1i1(<vscale x 16 x i1> [[PG:%.*]])
+// CPP-CHECK-NEXT: [[TMP1:%.*]] = shl i64 [[IDX:%.*]], 3
+// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sve.st1q.scatter.scalar.offset.nxv2i64.nxv2i64(<vscale x 2 x i64> [[DATA:%.*]], <vscale x 1 x i1> [[TMP0]], <vscale x 2 x i64> [[BASE:%.*]], i64 [[TMP1]])
+// CPP-CHECK-NEXT: ret void
+//
+void test_svst1q_scatter_u64base_index_s64(svbool_t pg, svuint64_t base, int64_t idx, svint64_t data) {
+ SVE_ACLE_FUNC(svst1q_scatter,_u64base,_index,_s64)(pg, base, idx, data);
+}
+
+// CHECK-LABEL: @test_svst1q_scatter_u64base_index_u64(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 1 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv1i1(<vscale x 16 x i1> [[PG:%.*]])
+// CHECK-NEXT: [[TMP1:%.*]] = shl i64 [[IDX:%.*]], 3
+// CHECK-NEXT: tail call void @llvm.aarch64.sve.st1q.scatter.scalar.offset.nxv2i64.nxv2i64(<vscale x 2 x i64> [[DATA:%.*]], <vscale x 1 x i1> [[TMP0]], <vscale x 2 x i64> [[BASE:%.*]], i64 [[TMP1]])
+// CHECK-NEXT: ret void
+//
+// CPP-CHECK-LABEL: @_Z37test_svst1q_scatter_u64base_index_u64u10__SVBool_tu12__SVUint64_tlS0_(
+// CPP-CHECK-NEXT: entry:
+// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 1 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv1i1(<vscale x 16 x i1> [[PG:%.*]])
+// CPP-CHECK-NEXT: [[TMP1:%.*]] = shl i64 [[IDX:%.*]], 3
+// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sve.st1q.scatter.scalar.offset.nxv2i64.nxv2i64(<vscale x 2 x i64> [[DATA:%.*]], <vscale x 1 x i1> [[TMP0]], <vscale x 2 x i64> [[BASE:%.*]], i64 [[TMP1]])
+// CPP-CHECK-NEXT: ret void
+//
+void test_svst1q_scatter_u64base_index_u64(svbool_t pg, svuint64_t base, int64_t idx, svuint64_t data) {
+ SVE_ACLE_FUNC(svst1q_scatter,_u64base,_index,_u64)(pg, base, idx, data);
+}
+
+// CHECK-LABEL: @test_svst1q_scatter_u64base_index_bf16(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 1 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv1i1(<vscale x 16 x i1> [[PG:%.*]])
+// CHECK-NEXT: [[TMP1:%.*]] = shl i64 [[IDX:%.*]], 1
+// CHECK-NEXT: tail call void @llvm.aarch64.sve.st1q.scatter.scalar.offset.nxv8bf16.nxv2i64(<vscale x 8 x bfloat> [[DATA:%.*]], <vscale x 1 x i1> [[TMP0]], <vscale x 2 x i64> [[BASE:%.*]], i64 [[TMP1]])
+// CHECK-NEXT: ret void
+//
+// CPP-CHECK-LABEL: @_Z38test_svst1q_scatter_u64base_index_bf16u10__SVBool_tu12__SVUint64_tlu14__SVBfloat16_t(
+// CPP-CHECK-NEXT: entry:
+// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 1 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv1i1(<vscale x 16 x i1> [[PG:%.*]])
+// CPP-CHECK-NEXT: [[TMP1:%.*]] = shl i64 [[IDX:%.*]], 1
+// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sve.st1q.scatter.scalar.offset.nxv8bf16.nxv2i64(<vscale x 8 x bfloat> [[DATA:%.*]], <vscale x 1 x i1> [[TMP0]], <vscale x 2 x i64> [[BASE:%.*]], i64 [[TMP1]])
+// CPP-CHECK-NEXT: ret void
+//
+void test_svst1q_scatter_u64base_index_bf16(svbool_t pg, svuint64_t base, int64_t idx, svbfloat16_t data) {
+ SVE_ACLE_FUNC(svst1q_scatter,_u64base,_index,_bf16)(pg, base, idx, data);
+}
+
+// CHECK-LABEL: @test_svst1q_scatter_u64base_index_f16(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 1 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv1i1(<vscale x 16 x i1> [[PG:%.*]])
+// CHECK-NEXT: [[TMP1:%.*]] = shl i64 [[IDX:%.*]], 1
+// CHECK-NEXT: tail call void @llvm.aarch64.sve.st1q.scatter.scalar.offset.nxv8f16.nxv2i64(<vscale x 8 x half> [[DATA:%.*]], <vscale x 1 x i1> [[TMP0]], <vscale x 2 x i64> [[BASE:%.*]], i64 [[TMP1]])
+// CHECK-NEXT: ret void
+//
+// CPP-CHECK-LABEL: @_Z37test_svst1q_scatter_u64base_index_f16u10__SVBool_tu12__SVUint64_tlu13__SVFloat16_t(
+// CPP-CHECK-NEXT: entry:
+// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 1 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv1i1(<vscale x 16 x i1> [[PG:%.*]])
+// CPP-CHECK-NEXT: [[TMP1:%.*]] = shl i64 [[IDX:%.*]], 1
+// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sve.st1q.scatter.scalar.offset.nxv8f16.nxv2i64(<vscale x 8 x half> [[DATA:%.*]], <vscale x 1 x i1> [[TMP0]], <vscale x 2 x i64> [[BASE:%.*]], i64 [[TMP1]])
+// CPP-CHECK-NEXT: ret void
+//
+void test_svst1q_scatter_u64base_index_f16(svbool_t pg, svuint64_t base, int64_t idx, svfloat16_t data) {
+ SVE_ACLE_FUNC(svst1q_scatter,_u64base,_index,_f16)(pg, base, idx, data);
+}
+
+// CHECK-LABEL: @test_svst1q_scatter_u64base_index_f32(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 1 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv1i1(<vscale x 16 x i1> [[PG:%.*]])
+// CHECK-NEXT: [[TMP1:%.*]] = shl i64 [[IDX:%.*]], 2
+// CHECK-NEXT: tail call void @llvm.aarch64.sve.st1q.scatter.scalar.offset.nxv4f32.nxv2i64(<vscale x 4 x float> [[DATA:%.*]], <vscale x 1 x i1> [[TMP0]], <vscale x 2 x i64> [[BASE:%.*]], i64 [[TMP1]])
+// CHECK-NEXT: ret void
+//
+// CPP-CHECK-LABEL: @_Z37test_svst1q_scatter_u64base_index_f32u10__SVBool_tu12__SVUint64_tlu13__SVFloat32_t(
+// CPP-CHECK-NEXT: entry:
+// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 1 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv1i1(<vscale x 16 x i1> [[PG:%.*]])
+// CPP-CHECK-NEXT: [[TMP1:%.*]] = shl i64 [[IDX:%.*]], 2
+// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sve.st1q.scatter.scalar.offset.nxv4f32.nxv2i64(<vscale x 4 x float> [[DATA:%.*]], <vscale x 1 x i1> [[TMP0]], <vscale x 2 x i64> [[BASE:%.*]], i64 [[TMP1]])
+// CPP-CHECK-NEXT: ret void
+//
+void test_svst1q_scatter_u64base_index_f32(svbool_t pg, svuint64_t base, int64_t idx, svfloat32_t data) {
+ SVE_ACLE_FUNC(svst1q_scatter,_u64base,_index,_f32)(pg, base, idx, data);
+}
+
+// CHECK-LABEL: @test_svst1q_scatter_u64base_index_f64(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 1 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv1i1(<vscale x 16 x i1> [[PG:%.*]])
+// CHECK-NEXT: [[TMP1:%.*]] = shl i64 [[IDX:%.*]], 3
+// CHECK-NEXT: tail call void @llvm.aarch64.sve.st1q.scatter.scalar.offset.nxv2f64.nxv2i64(<vscale x 2 x double> [[DATA:%.*]], <vscale x 1 x i1> [[TMP0]], <vscale x 2 x i64> [[BASE:%.*]], i64 [[TMP1]])
+// CHECK-NEXT: ret void
+//
+// CPP-CHECK-LABEL: @_Z37test_svst1q_scatter_u64base_index_f64u10__SVBool_tu12__SVUint64_tlu13__SVFloat64_t(
+// CPP-CHECK-NEXT: entry:
+// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 1 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv1i1(<vscale x 16 x i1> [[PG:%.*]])
+// CPP-CHECK-NEXT: [[TMP1:%.*]] = shl i64 [[IDX:%.*]], 3
+// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sve.st1q.scatter.scalar.offset.nxv2f64.nxv2i64(<vscale x 2 x double> [[DATA:%.*]], <vscale x 1 x i1> [[TMP0]], <vscale x 2 x i64> [[BASE:%.*]], i64 [[TMP1]])
+// CPP-CHECK-NEXT: ret void
+//
+void test_svst1q_scatter_u64base_index_f64(svbool_t pg, svuint64_t base, int64_t idx, svfloat64_t data) {
+ SVE_ACLE_FUNC(svst1q_scatter,_u64base,_index,_f64)(pg, base, idx, data);
+}
diff --git a/llvm/include/llvm/IR/IntrinsicsAArch64.td b/llvm/include/llvm/IR/IntrinsicsAArch64.td
index 35129d6b6c16bdc..a558a1eca84af6c 100644
--- a/llvm/include/llvm/IR/IntrinsicsAArch64.td
+++ b/llvm/include/llvm/IR/IntrinsicsAArch64.td
@@ -1466,6 +1466,15 @@ class AdvSIMD_GatherLoadQ_VS_Intrinsic
],
[IntrReadMem]>;
+class AdvSIMD_GatherLoadQ_SV_Intrinsic
+ : DefaultAttrsIntrinsic<[llvm_anyvector_ty],
+ [
+ llvm_nxv1i1_ty,
+ llvm_ptr_ty,
+ llvm_nxv2i64_ty
+ ],
+ [IntrReadMem, IntrArgMemOnly]>;
+
class AdvSIMD_GatherLoad_VS_WriteFFR_Intrinsic
: DefaultAttrsIntrinsic<[llvm_anyvector_ty],
[
@@ -1514,6 +1523,16 @@ class AdvSIMD_ScatterStoreQ_VS_Intrinsic
],
[IntrWriteMem]>;
+class AdvSIMD_ScatterStoreQ_SV_Intrinsic
+ : DefaultAttrsIntrinsic<[],
+ [
+ llvm_anyvector_ty,
+ llvm_nxv1i1_ty,
+ llvm_ptr_ty,
+ llvm_nxv2i64_ty
+ ],
+ [IntrWriteMem, IntrArgMemOnly]>;
+
class SVE_gather_prf_SV
: DefaultAttrsIntrinsic<[],
[
@@ -2144,6 +2163,9 @@ def int_aarch64_sve_ld1_gather_uxtw : AdvSIMD_GatherLoad_SV_32b_Offsets_Intrinsi
def int_aarch64_sve_ld1_gather_sxtw_index : AdvSIMD_GatherLoad_SV_32b_Offsets_Intrinsic;
def int_aarch64_sve_ld1_gather_uxtw_index : AdvSIMD_GatherLoad_SV_32b_Offsets_Intrinsic;
+// 128-bit loads, scaled offsets (indices)
+def int_aarch64_sve_ld1q_gather_index : AdvSIMD_GatherLoadQ_SV_Intrinsic;
+
//
// Gather loads: vector base + scalar offset
//
@@ -2222,6 +2244,9 @@ def int_aarch64_sve_st1_scatter_sxtw_index
def int_aarch64_sve_st1_scatter_uxtw_index
: AdvSIMD_ScatterStore_SV_32b_Offsets_Intrinsic;
+// 128-bit stores, scaled offsets (indices)
+def int_aarch64_sve_st1q_scatter_index : AdvSIMD_ScatterStoreQ_SV_Intrinsic;
+
//
// Scatter stores: vector base + scalar offset
//
diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
index f744643a9d9b388..8d37d91013b8641 100644
--- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
+++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
@@ -2579,6 +2579,7 @@ const char *AArch64TargetLowering::getTargetNodeName(unsigned Opcode) const {
MAKE_CASE(AArch64ISD::GLD1_UXTW_SCALED_MERGE_ZERO)
MAKE_CASE(AArch64ISD::GLD1_IMM_MERGE_ZERO)
MAKE_CASE(AArch64ISD::GLD1Q_MERGE_ZERO)
+ MAKE_CASE(AArch64ISD::GLD1Q_INDEX_MERGE_ZERO)
MAKE_CASE(AArch64ISD::GLD1S_MERGE_ZERO)
MAKE_CASE(AArch64ISD::GLD1S_SCALED_MERGE_ZERO)
MAKE_CASE(AArch64ISD::GLD1S_SXTW_MERGE_ZERO)
@@ -2604,6 +2605,7 @@ const char *AArch64TargetLowering::getTargetNodeName(unsigned Opcode) const {
MAKE_CASE(AArch64ISD::GLDNT1_INDEX_MERGE_ZERO)
MAKE_CASE(AArch64ISD::GLDNT1S_MERGE_ZERO)
MAKE_CASE(AArch64ISD::SST1Q_PRED)
+ MAKE_CASE(AArch64ISD::SST1Q_INDEX_PRED)
MAKE_CASE(AArch64ISD::ST1_PRED)
MAKE_CASE(AArch64ISD::SST1_PRED)
MAKE_CASE(AArch64ISD::SST1_SCALED_PRED)
@@ -22761,10 +22763,11 @@ static SDValue performScatterStoreCombine(SDNode *N, SelectionDAG &DAG,
return SDValue();
// For FPs, ACLE only supports _packed_ single and double precision types.
- // SST1Q_PRED is the ST1Q for sve2p1 and should allow all sizes
+ // SST1Q_[INDEX_]PRED is the ST1Q for sve2p1 and should allow all sizes.
if (SrcElVT.isFloatingPoint())
if ((SrcVT != MVT::nxv4f32) && (SrcVT != MVT::nxv2f64) &&
- (Opcode != AArch64ISD::SST1Q_PRED ||
+ ((Opcode != AArch64ISD::SST1Q_PRED &&
+ Opcode != AArch64ISD::SST1Q_INDEX_PRED) ||
((SrcVT != MVT::nxv8f16) && (SrcVT != MVT::nxv8bf16))))
return SDValue();
@@ -22782,6 +22785,10 @@ static SDValue performScatterStoreCombine(SDNode *N, SelectionDAG &DAG,
Offset =
getScaledOffsetForBitWidth(DAG, Offset, DL, SrcElVT.getSizeInBits());
Opcode = AArch64ISD::SSTNT1_PRED;
+ } else if (Opcode == AArch64ISD::SST1Q_INDEX_PRED) {
+ Offset =
+ getScaledOffsetForBitWidth(DAG, Offset, DL, SrcElVT.getSizeInBits());
+ Opcode = AArch64ISD::SST1Q_PRED;
}
// In the case of non-temporal gather loads there's only one SVE instruction
@@ -22789,7 +22796,8 @@ static SDValue performScatterStoreCombine(SDNode *N, SelectionDAG &DAG,
// * stnt1{b|h|w|d} { z0.s }, p0/z, [z0.s, x0]
// Since we do have intrinsics that allow the arguments to be in a different
// order, we may need to swap them to match the spec.
- if (Opcode == AArch64ISD::SSTNT1_PRED && Offset.getValueType().isVector())
+ if ((Opcode == AArch64ISD::SSTNT1_PRED || Opcode == AArch64ISD::SST1Q_PRED) &&
+ Offset.getValueType().isVector())
std::swap(Base, Offset);
// SST1_IMM requires that the offset is an immediate that is:
@@ -22872,21 +22880,26 @@ static SDValue performGatherLoadCombine(SDNode *N, SelectionDAG &DAG,
// vector of offsets (that fits into one register)
SDValue Offset = N->getOperand(4);
- // For "scalar + vector of indices", just scale the indices. This only
- // applies to non-temporal gathers because there's no instruction that takes
- // indicies.
+ // For "scalar + vector of indices", scale the indices to obtain unscaled
+ // offsets. This applies to non-temporal and quadword gathers, which do not
+ // have an addressing mode with scaled offset.
if (Opcode == AArch64ISD::GLDNT1_INDEX_MERGE_ZERO) {
Offset = getScaledOffsetForBitWidth(DAG, Offset, DL,
RetVT.getScalarSizeInBits());
Opcode = AArch64ISD::GLDNT1_MERGE_ZERO;
+ } else if (Opcode == AArch64ISD::GLD1Q_INDEX_MERGE_ZERO) {
+ Offset = getScaledOffsetForBitWidth(DAG, Offset, DL,
+ RetVT.getScalarSizeInBits());
+ Opcode = AArch64ISD::GLD1Q_MERGE_ZERO;
}
- // In the case of non-temporal gather loads there's only one SVE instruction
- // per data-size: "scalar + vector", i.e.
- // * ldnt1{b|h|w|d} { z0.s }, p0/z, [z0.s, x0]
+ // In the case of non-temporal gather loads and quadword gather loads there's
+ // only one addressing mode: "vector + scalar", e.g.
+ // ldnt1{b|h|w|d} { z0.s }, p0/z, [z0.s, x0]
// Since we do have intrinsics that allow the arguments to be in a different
// order, we may need to swap them to match the spec.
- if (Opcode == AArch64ISD::GLDNT1_MERGE_ZERO &&
+ if ((Opcode == AArch64ISD::GLDNT1_MERGE_ZERO ||
+ Opcode == AArch64ISD::GLD1Q_MERGE_ZERO) &&
Offset.getValueType().isVector())
std::swap(Base, Offset);
@@ -23736,6 +23749,9 @@ SDValue AArch64TargetLowering::PerformDAGCombine(SDNode *N,
return performGatherLoadCombine(N, DAG, AArch64ISD::GLD1_MERGE_ZERO);
case Intrinsic::aarch64_sve_ld1q_gather_scalar_offset:
return performGatherLoadCombine(N, DAG, AArch64ISD::GLD1Q_MERGE_ZERO);
+ case Intrinsic::aarch64_sve_ld1q_gather_index:
+ return performGatherLoadCombine(N, DAG,
+ AArch64ISD::GLD1Q_INDEX_MERGE_ZERO);
case Intrinsic::aarch64_sve_ld1_gather_index:
return performGatherLoadCombine(N, DAG,
AArch64ISD::GLD1_SCALED_MERGE_ZERO);
@@ -23781,6 +23797,8 @@ SDValue AArch64TargetLowering::PerformDAGCombine(SDNode *N,
AArch64ISD::GLDFF1_IMM_MERGE_ZERO);
case Intrinsic::aarch64_sve_st1q_scatter_scalar_offset:
return performScatterStoreCombine(N, DAG, AArch64ISD::SST1Q_PRED);
+ case Intrinsic::aarch64_sve_st1q_scatter_index:
+ return performScatterStoreCombine(N, DAG, AArch64ISD::SST1Q_INDEX_PRED);
case Intrinsic::aarch64_sve_st1_scatter:
return performScatterStoreCombine(N, DAG, AArch64ISD::SST1_PRED);
case Intrinsic::aarch64_sve_st1_scatter_index:
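Since ld1q/st1q have no scaled-index addressing mode, both combines rewrite the
new *_INDEX nodes into the existing unscaled ones. A scalar sketch of the per-lane
effect (illustrative only; the patch's getScaledOffsetForBitWidth performs the
vector equivalent as a shift):

  #include <stdint.h>

  /* Sketch: an element index becomes the byte offset the unscaled node
     consumes. For 16/32/64-bit elements this is the lsl #1/#2/#3 visible
     in the tests below. */
  static uint64_t index_to_byte_offset(uint64_t idx, unsigned elt_bits) {
    return idx * (uint64_t)(elt_bits / 8);
  }

After the rewrite, Base and Offset are swapped whenever the offset operand is the
vector, so the operands land in the "[Zbase.d, Xoffset]" order the instruction expects.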
diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.h b/llvm/lib/Target/AArch64/AArch64ISelLowering.h
index b4e89297ad58780..169b0dbab65cdca 100644
--- a/llvm/lib/Target/AArch64/AArch64ISelLowering.h
+++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.h
@@ -376,6 +376,7 @@ enum NodeType : unsigned {
GLD1_SXTW_SCALED_MERGE_ZERO,
GLD1_IMM_MERGE_ZERO,
GLD1Q_MERGE_ZERO,
+ GLD1Q_INDEX_MERGE_ZERO,
// Signed gather loads
GLD1S_MERGE_ZERO,
@@ -421,6 +422,7 @@ enum NodeType : unsigned {
SST1_SXTW_SCALED_PRED,
SST1_IMM_PRED,
SST1Q_PRED,
+ SST1Q_INDEX_PRED,
// Non-temporal scatter store
SSTNT1_PRED,
diff --git a/llvm/test/CodeGen/AArch64/sve2p1-intrinsics-gather-loads-128bit-index.ll b/llvm/test/CodeGen/AArch64/sve2p1-intrinsics-gather-loads-128bit-index.ll
new file mode 100644
index 000000000000000..4ad13ee97f010e2
--- /dev/null
+++ b/llvm/test/CodeGen/AArch64/sve2p1-intrinsics-gather-loads-128bit-index.ll
@@ -0,0 +1,249 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 3
+; RUN: llc -mtriple=aarch64-linux-gnu -mattr=+sve2p1,+bf16 < %s | FileCheck %s
+
+declare <vscale x 2 x i64> @llvm.aarch64.sve.ld1q.gather.scalar.offset.nxv2i64.nxv2i64(<vscale x 1 x i1>, <vscale x 2 x i64>, i64)
+declare <vscale x 4 x i32> @llvm.aarch64.sve.ld1q.gather.scalar.offset.nxv4i32.nxv2i64(<vscale x 1 x i1>, <vscale x 2 x i64>, i64)
+declare <vscale x 8 x i16> @llvm.aarch64.sve.ld1q.gather.scalar.offset.nxv8i16.nxv2i64(<vscale x 1 x i1>, <vscale x 2 x i64>, i64)
+declare <vscale x 16 x i8> @llvm.aarch64.sve.ld1q.gather.scalar.offset.nxv16i8.nxv2i64(<vscale x 1 x i1>, <vscale x 2 x i64>, i64)
+declare <vscale x 2 x double> @llvm.aarch64.sve.ld1q.gather.scalar.offset.nxv2f64.nxv2i64(<vscale x 1 x i1>, <vscale x 2 x i64>, i64)
+declare <vscale x 4 x float> @llvm.aarch64.sve.ld1q.gather.scalar.offset.nxv4f32.nxv2i64(<vscale x 1 x i1>, <vscale x 2 x i64>, i64)
+declare <vscale x 8 x half> @llvm.aarch64.sve.ld1q.gather.scalar.offset.nxv8f16.nxv2i64(<vscale x 1 x i1>, <vscale x 2 x i64>, i64)
+declare <vscale x 8 x bfloat> @llvm.aarch64.sve.ld1q.gather.scalar.offset.nxv8bf16.nxv2i64(<vscale x 1 x i1>, <vscale x 2 x i64>, i64)
+declare <vscale x 4 x i32> @llvm.aarch64.sve.ld1q.gather.index.nxv4i32(<vscale x 1 x i1>, ptr, <vscale x 2 x i64>)
+declare <vscale x 8 x i16> @llvm.aarch64.sve.ld1q.gather.index.nxv8i16(<vscale x 1 x i1>, ptr, <vscale x 2 x i64>)
+declare <vscale x 2 x i64> @llvm.aarch64.sve.ld1q.gather.index.nxv2i64(<vscale x 1 x i1>, ptr, <vscale x 2 x i64>)
+declare <vscale x 8 x bfloat> @llvm.aarch64.sve.ld1q.gather.index.nxv8bf16(<vscale x 1 x i1>, ptr, <vscale x 2 x i64>)
+declare <vscale x 8 x half> @llvm.aarch64.sve.ld1q.gather.index.nxv8f16(<vscale x 1 x i1>, ptr, <vscale x 2 x i64>)
+declare <vscale x 4 x float> @llvm.aarch64.sve.ld1q.gather.index.nxv4f32(<vscale x 1 x i1>, ptr, <vscale x 2 x i64>)
+declare <vscale x 2 x double> @llvm.aarch64.sve.ld1q.gather.index.nxv2f64(<vscale x 1 x i1>, ptr, <vscale x 2 x i64>)
+
+define <vscale x 8 x i16> @test_svld1q_gather_u64index_s16(<vscale x 1 x i1> %pg, ptr %base, <vscale x 2 x i64> %idx) {
+; CHECK-LABEL: test_svld1q_gather_u64index_s16:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: lsl z0.d, z0.d, #1
+; CHECK-NEXT: ld1q { z0.q }, p0/z, [z0.d, x0]
+; CHECK-NEXT: ret
+entry:
+ %0 = tail call <vscale x 8 x i16> @llvm.aarch64.sve.ld1q.gather.index.nxv8i16(<vscale x 1 x i1> %pg, ptr %base, <vscale x 2 x i64> %idx)
+ ret <vscale x 8 x i16> %0
+}
+
+define <vscale x 8 x i16> @test_svld1q_gather_u64index_u16(<vscale x 1 x i1> %pg, ptr %base, <vscale x 2 x i64> %idx) {
+; CHECK-LABEL: test_svld1q_gather_u64index_u16:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: lsl z0.d, z0.d, #1
+; CHECK-NEXT: ld1q { z0.q }, p0/z, [z0.d, x0]
+; CHECK-NEXT: ret
+entry:
+ %0 = tail call <vscale x 8 x i16> @llvm.aarch64.sve.ld1q.gather.index.nxv8i16(<vscale x 1 x i1> %pg, ptr %base, <vscale x 2 x i64> %idx)
+ ret <vscale x 8 x i16> %0
+}
+
+define <vscale x 4 x i32> @test_svld1q_gather_u64index_s32(<vscale x 1 x i1> %pg, ptr %base, <vscale x 2 x i64> %idx) {
+; CHECK-LABEL: test_svld1q_gather_u64index_s32:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: lsl z0.d, z0.d, #2
+; CHECK-NEXT: ld1q { z0.q }, p0/z, [z0.d, x0]
+; CHECK-NEXT: ret
+entry:
+ %0 = tail call <vscale x 4 x i32> @llvm.aarch64.sve.ld1q.gather.index.nxv4i32(<vscale x 1 x i1> %pg, ptr %base, <vscale x 2 x i64> %idx)
+ ret <vscale x 4 x i32> %0
+}
+
+define <vscale x 4 x i32> @test_svld1q_gather_u64index_u32(<vscale x 1 x i1> %pg, ptr %base, <vscale x 2 x i64> %idx) {
+; CHECK-LABEL: test_svld1q_gather_u64index_u32:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: lsl z0.d, z0.d, #2
+; CHECK-NEXT: ld1q { z0.q }, p0/z, [z0.d, x0]
+; CHECK-NEXT: ret
+entry:
+ %0 = tail call <vscale x 4 x i32> @llvm.aarch64.sve.ld1q.gather.index.nxv4i32(<vscale x 1 x i1> %pg, ptr %base, <vscale x 2 x i64> %idx)
+ ret <vscale x 4 x i32> %0
+}
+
+define <vscale x 2 x i64> @test_svld1q_gather_u64index_s64(<vscale x 1 x i1> %pg, ptr %base, <vscale x 2 x i64> %idx) {
+; CHECK-LABEL: test_svld1q_gather_u64index_s64:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: lsl z0.d, z0.d, #3
+; CHECK-NEXT: ld1q { z0.q }, p0/z, [z0.d, x0]
+; CHECK-NEXT: ret
+entry:
+ %0 = tail call <vscale x 2 x i64> @llvm.aarch64.sve.ld1q.gather.index.nxv2i64(<vscale x 1 x i1> %pg, ptr %base, <vscale x 2 x i64> %idx)
+ ret <vscale x 2 x i64> %0
+}
+
+define <vscale x 2 x i64> @test_svld1q_gather_u64index_u64(<vscale x 1 x i1> %pg, ptr %base, <vscale x 2 x i64> %idx) {
+; CHECK-LABEL: test_svld1q_gather_u64index_u64:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: lsl z0.d, z0.d, #3
+; CHECK-NEXT: ld1q { z0.q }, p0/z, [z0.d, x0]
+; CHECK-NEXT: ret
+entry:
+ %0 = tail call <vscale x 2 x i64> @llvm.aarch64.sve.ld1q.gather.index.nxv2i64(<vscale x 1 x i1> %pg, ptr %base, <vscale x 2 x i64> %idx)
+ ret <vscale x 2 x i64> %0
+}
+
+define <vscale x 8 x bfloat> @test_svld1q_gather_u64index_bf16(<vscale x 1 x i1> %pg, ptr %base, <vscale x 2 x i64> %idx) {
+; CHECK-LABEL: test_svld1q_gather_u64index_bf16:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: lsl z0.d, z0.d, #1
+; CHECK-NEXT: ld1q { z0.q }, p0/z, [z0.d, x0]
+; CHECK-NEXT: ret
+entry:
+ %0 = tail call <vscale x 8 x bfloat> @llvm.aarch64.sve.ld1q.gather.index.nxv8bf16(<vscale x 1 x i1> %pg, ptr %base, <vscale x 2 x i64> %idx)
+ ret <vscale x 8 x bfloat> %0
+}
+
+define <vscale x 8 x half> @test_svld1q_gather_u64index_f16(<vscale x 1 x i1> %pg, ptr %base, <vscale x 2 x i64> %idx) {
+; CHECK-LABEL: test_svld1q_gather_u64index_f16:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: lsl z0.d, z0.d, #1
+; CHECK-NEXT: ld1q { z0.q }, p0/z, [z0.d, x0]
+; CHECK-NEXT: ret
+entry:
+ %0 = tail call <vscale x 8 x half> @llvm.aarch64.sve.ld1q.gather.index.nxv8f16(<vscale x 1 x i1> %pg, ptr %base, <vscale x 2 x i64> %idx)
+ ret <vscale x 8 x half> %0
+}
+
+define <vscale x 4 x float> @test_svld1q_gather_u64index_f32(<vscale x 1 x i1> %pg, ptr %base, <vscale x 2 x i64> %idx) {
+; CHECK-LABEL: test_svld1q_gather_u64index_f32:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: lsl z0.d, z0.d, #2
+; CHECK-NEXT: ld1q { z0.q }, p0/z, [z0.d, x0]
+; CHECK-NEXT: ret
+entry:
+ %0 = tail call <vscale x 4 x float> @llvm.aarch64.sve.ld1q.gather.index.nxv4f32(<vscale x 1 x i1> %pg, ptr %base, <vscale x 2 x i64> %idx)
+ ret <vscale x 4 x float> %0
+}
+
+define <vscale x 2 x double> @test_svld1q_gather_u64index_f64(<vscale x 1 x i1> %pg, ptr %base, <vscale x 2 x i64> %idx) {
+; CHECK-LABEL: test_svld1q_gather_u64index_f64:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: lsl z0.d, z0.d, #3
+; CHECK-NEXT: ld1q { z0.q }, p0/z, [z0.d, x0]
+; CHECK-NEXT: ret
+entry:
+ %0 = tail call <vscale x 2 x double> @llvm.aarch64.sve.ld1q.gather.index.nxv2f64(<vscale x 1 x i1> %pg, ptr %base, <vscale x 2 x i64> %idx)
+ ret <vscale x 2 x double> %0
+}
+
+define <vscale x 8 x i16> @test_svld1q_gather_u64base_index_s16(<vscale x 1 x i1> %pg, <vscale x 2 x i64> %base, i64 %idx) {
+; CHECK-LABEL: test_svld1q_gather_u64base_index_s16:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: lsl x8, x0, #1
+; CHECK-NEXT: ld1q { z0.q }, p0/z, [z0.d, x8]
+; CHECK-NEXT: ret
+entry:
+ %0 = shl i64 %idx, 1
+ %1 = tail call <vscale x 8 x i16> @llvm.aarch64.sve.ld1q.gather.scalar.offset.nxv8i16.nxv2i64(<vscale x 1 x i1> %pg, <vscale x 2 x i64> %base, i64 %0)
+ ret <vscale x 8 x i16> %1
+}
+
+define <vscale x 8 x i16> @test_svld1q_gather_u64base_index_u16(<vscale x 1 x i1> %pg, <vscale x 2 x i64> %base, i64 %idx) {
+; CHECK-LABEL: test_svld1q_gather_u64base_index_u16:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: lsl x8, x0, #1
+; CHECK-NEXT: ld1q { z0.q }, p0/z, [z0.d, x8]
+; CHECK-NEXT: ret
+entry:
+ %0 = shl i64 %idx, 1
+ %1 = tail call <vscale x 8 x i16> @llvm.aarch64.sve.ld1q.gather.scalar.offset.nxv8i16.nxv2i64(<vscale x 1 x i1> %pg, <vscale x 2 x i64> %base, i64 %0)
+ ret <vscale x 8 x i16> %1
+}
+
+define <vscale x 4 x i32> @test_svld1q_gather_u64base_index_s32(<vscale x 1 x i1> %pg, <vscale x 2 x i64> %base, i64 %idx) {
+; CHECK-LABEL: test_svld1q_gather_u64base_index_s32:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: lsl x8, x0, #2
+; CHECK-NEXT: ld1q { z0.q }, p0/z, [z0.d, x8]
+; CHECK-NEXT: ret
+entry:
+ %0 = shl i64 %idx, 2
+ %1 = tail call <vscale x 4 x i32> @llvm.aarch64.sve.ld1q.gather.scalar.offset.nxv4i32.nxv2i64(<vscale x 1 x i1> %pg, <vscale x 2 x i64> %base, i64 %0)
+ ret <vscale x 4 x i32> %1
+}
+
+define <vscale x 4 x i32> @test_svld1q_gather_u64base_index_u32(<vscale x 1 x i1> %pg, <vscale x 2 x i64> %base, i64 %idx) {
+; CHECK-LABEL: test_svld1q_gather_u64base_index_u32:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: lsl x8, x0, #2
+; CHECK-NEXT: ld1q { z0.q }, p0/z, [z0.d, x8]
+; CHECK-NEXT: ret
+entry:
+ %0 = shl i64 %idx, 2
+ %1 = tail call <vscale x 4 x i32> @llvm.aarch64.sve.ld1q.gather.scalar.offset.nxv4i32.nxv2i64(<vscale x 1 x i1> %pg, <vscale x 2 x i64> %base, i64 %0)
+ ret <vscale x 4 x i32> %1
+}
+
+define <vscale x 2 x i64> @test_svld1q_gather_u64base_index_s64(<vscale x 1 x i1> %pg, <vscale x 2 x i64> %base, i64 %idx) {
+; CHECK-LABEL: test_svld1q_gather_u64base_index_s64:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: lsl x8, x0, #3
+; CHECK-NEXT: ld1q { z0.q }, p0/z, [z0.d, x8]
+; CHECK-NEXT: ret
+entry:
+ %0 = shl i64 %idx, 3
+ %1 = tail call <vscale x 2 x i64> @llvm.aarch64.sve.ld1q.gather.scalar.offset.nxv2i64.nxv2i64(<vscale x 1 x i1> %pg, <vscale x 2 x i64> %base, i64 %0)
+ ret <vscale x 2 x i64> %1
+}
+
+define <vscale x 2 x i64> @test_svld1q_gather_u64base_index_u64(<vscale x 1 x i1> %pg, <vscale x 2 x i64> %base, i64 %idx) {
+; CHECK-LABEL: test_svld1q_gather_u64base_index_u64:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: lsl x8, x0, #3
+; CHECK-NEXT: ld1q { z0.q }, p0/z, [z0.d, x8]
+; CHECK-NEXT: ret
+entry:
+ %0 = shl i64 %idx, 3
+ %1 = tail call <vscale x 2 x i64> @llvm.aarch64.sve.ld1q.gather.scalar.offset.nxv2i64.nxv2i64(<vscale x 1 x i1> %pg, <vscale x 2 x i64> %base, i64 %0)
+ ret <vscale x 2 x i64> %1
+}
+
+define <vscale x 8 x bfloat> @test_svld1q_gather_u64base_index_bf16(<vscale x 1 x i1> %pg, <vscale x 2 x i64> %base, i64 %idx) {
+; CHECK-LABEL: test_svld1q_gather_u64base_index_bf16:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: lsl x8, x0, #1
+; CHECK-NEXT: ld1q { z0.q }, p0/z, [z0.d, x8]
+; CHECK-NEXT: ret
+entry:
+ %0 = shl i64 %idx, 1
+ %1 = tail call <vscale x 8 x bfloat> @llvm.aarch64.sve.ld1q.gather.scalar.offset.nxv8bf16.nxv2i64(<vscale x 1 x i1> %pg, <vscale x 2 x i64> %base, i64 %0)
+ ret <vscale x 8 x bfloat> %1
+}
+
+define <vscale x 8 x half> @test_svld1q_gather_u64base_index_f16(<vscale x 1 x i1> %pg, <vscale x 2 x i64> %base, i64 %idx) {
+; CHECK-LABEL: test_svld1q_gather_u64base_index_f16:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: lsl x8, x0, #1
+; CHECK-NEXT: ld1q { z0.q }, p0/z, [z0.d, x8]
+; CHECK-NEXT: ret
+entry:
+ %0 = shl i64 %idx, 1
+ %1 = tail call <vscale x 8 x half> @llvm.aarch64.sve.ld1q.gather.scalar.offset.nxv8f16.nxv2i64(<vscale x 1 x i1> %pg, <vscale x 2 x i64> %base, i64 %0)
+ ret <vscale x 8 x half> %1
+}
+
+define <vscale x 4 x float> @test_svld1q_gather_u64base_index_f32(<vscale x 1 x i1> %pg, <vscale x 2 x i64> %base, i64 %idx) {
+; CHECK-LABEL: test_svld1q_gather_u64base_index_f32:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: lsl x8, x0, #2
+; CHECK-NEXT: ld1q { z0.q }, p0/z, [z0.d, x8]
+; CHECK-NEXT: ret
+entry:
+ %0 = shl i64 %idx, 2
+ %1 = tail call <vscale x 4 x float> @llvm.aarch64.sve.ld1q.gather.scalar.offset.nxv4f32.nxv2i64(<vscale x 1 x i1> %pg, <vscale x 2 x i64> %base, i64 %0)
+ ret <vscale x 4 x float> %1
+}
+
+define <vscale x 2 x double> @test_svld1q_gather_u64base_index_f64(<vscale x 1 x i1> %pg, <vscale x 2 x i64> %base, i64 %idx) {
+; CHECK-LABEL: test_svld1q_gather_u64base_index_f64:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: lsl x8, x0, #3
+; CHECK-NEXT: ld1q { z0.q }, p0/z, [z0.d, x8]
+; CHECK-NEXT: ret
+entry:
+ %0 = shl i64 %idx, 3
+ %1 = tail call <vscale x 2 x double> @llvm.aarch64.sve.ld1q.gather.scalar.offset.nxv2f64.nxv2i64(<vscale x 1 x i1> %pg, <vscale x 2 x i64> %base, i64 %0)
+ ret <vscale x 2 x double> %1
+}
+
diff --git a/llvm/test/CodeGen/AArch64/sve2p1-intrinsics-scatter-stores-128bit-index.ll b/llvm/test/CodeGen/AArch64/sve2p1-intrinsics-scatter-stores-128bit-index.ll
new file mode 100644
index 000000000000000..29c9c6a23235e15
--- /dev/null
+++ b/llvm/test/CodeGen/AArch64/sve2p1-intrinsics-scatter-stores-128bit-index.ll
@@ -0,0 +1,248 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 3
+; RUN: llc -mtriple=aarch64-linux-gnu -mattr=+sve2p1,+bf16 < %s | FileCheck %s
+
+declare void @llvm.aarch64.sve.st1q.scatter.scalar.offset.nxv2i64.nxv2i64(<vscale x 2 x i64>, <vscale x 1 x i1>, <vscale x 2 x i64>, i64)
+declare void @llvm.aarch64.sve.st1q.scatter.scalar.offset.nxv4i32.nxv2i64(<vscale x 4 x i32>, <vscale x 1 x i1>, <vscale x 2 x i64>, i64)
+declare void @llvm.aarch64.sve.st1q.scatter.scalar.offset.nxv8i16.nxv2i64(<vscale x 8 x i16>, <vscale x 1 x i1>, <vscale x 2 x i64>, i64)
+declare void @llvm.aarch64.sve.st1q.scatter.scalar.offset.nxv16i8.nxv2i64(<vscale x 16 x i8>, <vscale x 1 x i1>, <vscale x 2 x i64>, i64)
+declare void @llvm.aarch64.sve.st1q.scatter.scalar.offset.nxv2f64.nxv2i64(<vscale x 2 x double>, <vscale x 1 x i1>, <vscale x 2 x i64>, i64)
+declare void @llvm.aarch64.sve.st1q.scatter.scalar.offset.nxv4f32.nxv2i64(<vscale x 4 x float>, <vscale x 1 x i1>, <vscale x 2 x i64>, i64)
+declare void @llvm.aarch64.sve.st1q.scatter.scalar.offset.nxv8f16.nxv2i64(<vscale x 8 x half>, <vscale x 1 x i1>, <vscale x 2 x i64>, i64)
+declare void @llvm.aarch64.sve.st1q.scatter.scalar.offset.nxv8bf16.nxv2i64(<vscale x 8 x bfloat>, <vscale x 1 x i1>, <vscale x 2 x i64>, i64)
+declare void @llvm.aarch64.sve.st1q.scatter.index.nxv8i16(<vscale x 8 x i16>, <vscale x 1 x i1>, ptr, <vscale x 2 x i64>)
+declare void @llvm.aarch64.sve.st1q.scatter.index.nxv4i32(<vscale x 4 x i32>, <vscale x 1 x i1>, ptr, <vscale x 2 x i64>)
+declare void @llvm.aarch64.sve.st1q.scatter.index.nxv2i64(<vscale x 2 x i64>, <vscale x 1 x i1>, ptr, <vscale x 2 x i64>)
+declare void @llvm.aarch64.sve.st1q.scatter.index.nxv8bf16(<vscale x 8 x bfloat>, <vscale x 1 x i1>, ptr, <vscale x 2 x i64>)
+declare void @llvm.aarch64.sve.st1q.scatter.index.nxv8f16(<vscale x 8 x half>, <vscale x 1 x i1>, ptr, <vscale x 2 x i64>)
+declare void @llvm.aarch64.sve.st1q.scatter.index.nxv4f32(<vscale x 4 x float>, <vscale x 1 x i1>, ptr, <vscale x 2 x i64>)
+declare void @llvm.aarch64.sve.st1q.scatter.index.nxv2f64(<vscale x 2 x double>, <vscale x 1 x i1>, ptr, <vscale x 2 x i64>)
+
+define void @test_svst1q_scatter_u64index_s16(<vscale x 1 x i1> %pg, ptr %base, <vscale x 2 x i64> %idx, <vscale x 8 x i16> %data) {
+; CHECK-LABEL: test_svst1q_scatter_u64index_s16:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: lsl z0.d, z0.d, #1
+; CHECK-NEXT: st1q { z1.q }, p0, [z0.d, x0]
+; CHECK-NEXT: ret
+entry:
+ tail call void @llvm.aarch64.sve.st1q.scatter.index.nxv8i16(<vscale x 8 x i16> %data, <vscale x 1 x i1> %pg, ptr %base, <vscale x 2 x i64> %idx)
+ ret void
+}
+
+define void @test_svst1q_scatter_u64index_u16(<vscale x 1 x i1> %pg, ptr %base, <vscale x 2 x i64> %idx, <vscale x 8 x i16> %data) {
+; CHECK-LABEL: test_svst1q_scatter_u64index_u16:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: lsl z0.d, z0.d, #1
+; CHECK-NEXT: st1q { z1.q }, p0, [z0.d, x0]
+; CHECK-NEXT: ret
+entry:
+ tail call void @llvm.aarch64.sve.st1q.scatter.index.nxv8i16(<vscale x 8 x i16> %data, <vscale x 1 x i1> %pg, ptr %base, <vscale x 2 x i64> %idx)
+ ret void
+}
+
+define void @test_svst1q_scatter_u64index_s32(<vscale x 1 x i1> %pg, ptr %base, <vscale x 2 x i64> %idx, <vscale x 4 x i32> %data) {
+; CHECK-LABEL: test_svst1q_scatter_u64index_s32:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: lsl z0.d, z0.d, #2
+; CHECK-NEXT: st1q { z1.q }, p0, [z0.d, x0]
+; CHECK-NEXT: ret
+entry:
+ tail call void @llvm.aarch64.sve.st1q.scatter.index.nxv4i32(<vscale x 4 x i32> %data, <vscale x 1 x i1> %pg, ptr %base, <vscale x 2 x i64> %idx)
+ ret void
+}
+
+define void @test_svst1q_scatter_u64index_u32(<vscale x 1 x i1> %pg, ptr %base, <vscale x 2 x i64> %idx, <vscale x 4 x i32> %data) {
+; CHECK-LABEL: test_svst1q_scatter_u64index_u32:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: lsl z0.d, z0.d, #2
+; CHECK-NEXT: st1q { z1.q }, p0, [z0.d, x0]
+; CHECK-NEXT: ret
+entry:
+ tail call void @llvm.aarch64.sve.st1q.scatter.index.nxv4i32(<vscale x 4 x i32> %data, <vscale x 1 x i1> %pg, ptr %base, <vscale x 2 x i64> %idx)
+ ret void
+}
+
+define void @test_svst1q_scatter_u64index_s64(<vscale x 1 x i1> %pg, ptr %base, <vscale x 2 x i64> %idx, <vscale x 2 x i64> %data) {
+; CHECK-LABEL: test_svst1q_scatter_u64index_s64:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: lsl z0.d, z0.d, #3
+; CHECK-NEXT: st1q { z1.q }, p0, [z0.d, x0]
+; CHECK-NEXT: ret
+entry:
+ tail call void @llvm.aarch64.sve.st1q.scatter.index.nxv2i64(<vscale x 2 x i64> %data, <vscale x 1 x i1> %pg, ptr %base, <vscale x 2 x i64> %idx)
+ ret void
+}
+
+define void @test_svst1q_scatter_u64index_u64(<vscale x 1 x i1> %pg, ptr %base, <vscale x 2 x i64> %idx, <vscale x 2 x i64> %data) {
+; CHECK-LABEL: test_svst1q_scatter_u64index_u64:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: lsl z0.d, z0.d, #3
+; CHECK-NEXT: st1q { z1.q }, p0, [z0.d, x0]
+; CHECK-NEXT: ret
+entry:
+ tail call void @llvm.aarch64.sve.st1q.scatter.index.nxv2i64(<vscale x 2 x i64> %data, <vscale x 1 x i1> %pg, ptr %base, <vscale x 2 x i64> %idx)
+ ret void
+}
+
+define void @test_svst1q_scatter_u64index_bf16(<vscale x 1 x i1> %pg, ptr %base, <vscale x 2 x i64> %idx, <vscale x 8 x bfloat> %data) {
+; CHECK-LABEL: test_svst1q_scatter_u64index_bf16:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: lsl z0.d, z0.d, #1
+; CHECK-NEXT: st1q { z1.q }, p0, [z0.d, x0]
+; CHECK-NEXT: ret
+entry:
+ tail call void @llvm.aarch64.sve.st1q.scatter.index.nxv8bf16(<vscale x 8 x bfloat> %data, <vscale x 1 x i1> %pg, ptr %base, <vscale x 2 x i64> %idx)
+ ret void
+}
+
+define void @test_svst1q_scatter_u64index_f16(<vscale x 1 x i1> %pg, ptr %base, <vscale x 2 x i64> %idx, <vscale x 8 x half> %data) {
+; CHECK-LABEL: test_svst1q_scatter_u64index_f16:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: lsl z0.d, z0.d, #1
+; CHECK-NEXT: st1q { z1.q }, p0, [z0.d, x0]
+; CHECK-NEXT: ret
+entry:
+ tail call void @llvm.aarch64.sve.st1q.scatter.index.nxv8f16(<vscale x 8 x half> %data, <vscale x 1 x i1> %pg, ptr %base, <vscale x 2 x i64> %idx)
+ ret void
+}
+
+define void @test_svst1q_scatter_u64index_f32(<vscale x 1 x i1> %pg, ptr %base, <vscale x 2 x i64> %idx, <vscale x 4 x float> %data) {
+; CHECK-LABEL: test_svst1q_scatter_u64index_f32:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: lsl z0.d, z0.d, #2
+; CHECK-NEXT: st1q { z1.q }, p0, [z0.d, x0]
+; CHECK-NEXT: ret
+entry:
+ tail call void @llvm.aarch64.sve.st1q.scatter.index.nxv4f32(<vscale x 4 x float> %data, <vscale x 1 x i1> %pg, ptr %base, <vscale x 2 x i64> %idx)
+ ret void
+}
+
+define void @test_svst1q_scatter_u64index_f64(<vscale x 1 x i1> %pg, ptr %base, <vscale x 2 x i64> %idx, <vscale x 2 x double> %data) {
+; CHECK-LABEL: test_svst1q_scatter_u64index_f64:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: lsl z0.d, z0.d, #3
+; CHECK-NEXT: st1q { z1.q }, p0, [z0.d, x0]
+; CHECK-NEXT: ret
+entry:
+ tail call void @llvm.aarch64.sve.st1q.scatter.index.nxv2f64(<vscale x 2 x double> %data, <vscale x 1 x i1> %pg, ptr %base, <vscale x 2 x i64> %idx)
+ ret void
+}
+
+define void @test_svst1q_scatter_u64base_index_s16(<vscale x 1 x i1> %pg, <vscale x 2 x i64> %base, i64 %idx, <vscale x 8 x i16> %data) {
+; CHECK-LABEL: test_svst1q_scatter_u64base_index_s16:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: lsl x8, x0, #1
+; CHECK-NEXT: st1q { z1.q }, p0, [z0.d, x8]
+; CHECK-NEXT: ret
+entry:
+ %0 = shl i64 %idx, 1
+ tail call void @llvm.aarch64.sve.st1q.scatter.scalar.offset.nxv8i16.nxv2i64(<vscale x 8 x i16> %data, <vscale x 1 x i1> %pg, <vscale x 2 x i64> %base, i64 %0)
+ ret void
+}
+
+define void @test_svst1q_scatter_u64base_index_u16(<vscale x 1 x i1> %pg, <vscale x 2 x i64> %base, i64 %idx, <vscale x 8 x i16> %data) {
+; CHECK-LABEL: test_svst1q_scatter_u64base_index_u16:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: lsl x8, x0, #1
+; CHECK-NEXT: st1q { z1.q }, p0, [z0.d, x8]
+; CHECK-NEXT: ret
+entry:
+ %0 = shl i64 %idx, 1
+ tail call void @llvm.aarch64.sve.st1q.scatter.scalar.offset.nxv8i16.nxv2i64(<vscale x 8 x i16> %data, <vscale x 1 x i1> %pg, <vscale x 2 x i64> %base, i64 %0)
+ ret void
+}
+
+define void @test_svst1q_scatter_u64base_index_s32(<vscale x 1 x i1> %pg, <vscale x 2 x i64> %base, i64 %idx, <vscale x 4 x i32> %data) {
+; CHECK-LABEL: test_svst1q_scatter_u64base_index_s32:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: lsl x8, x0, #2
+; CHECK-NEXT: st1q { z1.q }, p0, [z0.d, x8]
+; CHECK-NEXT: ret
+entry:
+ %0 = shl i64 %idx, 2
+ tail call void @llvm.aarch64.sve.st1q.scatter.scalar.offset.nxv4i32.nxv2i64(<vscale x 4 x i32> %data, <vscale x 1 x i1> %pg, <vscale x 2 x i64> %base, i64 %0)
+ ret void
+}
+
+define void @test_svst1q_scatter_u64base_index_u32(<vscale x 1 x i1> %pg, <vscale x 2 x i64> %base, i64 %idx, <vscale x 4 x i32> %data) {
+; CHECK-LABEL: test_svst1q_scatter_u64base_index_u32:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: lsl x8, x0, #2
+; CHECK-NEXT: st1q { z1.q }, p0, [z0.d, x8]
+; CHECK-NEXT: ret
+entry:
+ %0 = shl i64 %idx, 2
+ tail call void @llvm.aarch64.sve.st1q.scatter.scalar.offset.nxv4i32.nxv2i64(<vscale x 4 x i32> %data, <vscale x 1 x i1> %pg, <vscale x 2 x i64> %base, i64 %0)
+ ret void
+}
+
+define void @test_svst1q_scatter_u64base_index_s64(<vscale x 1 x i1> %pg, <vscale x 2 x i64> %base, i64 %idx, <vscale x 2 x i64> %data) {
+; CHECK-LABEL: test_svst1q_scatter_u64base_index_s64:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: lsl x8, x0, #3
+; CHECK-NEXT: st1q { z1.q }, p0, [z0.d, x8]
+; CHECK-NEXT: ret
+entry:
+ %0 = shl i64 %idx, 3
+ tail call void @llvm.aarch64.sve.st1q.scatter.scalar.offset.nxv2i64.nxv2i64(<vscale x 2 x i64> %data, <vscale x 1 x i1> %pg, <vscale x 2 x i64> %base, i64 %0)
+ ret void
+}
+
+define void @test_svst1q_scatter_u64base_index_u64(<vscale x 1 x i1> %pg, <vscale x 2 x i64> %base, i64 %idx, <vscale x 2 x i64> %data) {
+; CHECK-LABEL: test_svst1q_scatter_u64base_index_u64:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: lsl x8, x0, #3
+; CHECK-NEXT: st1q { z1.q }, p0, [z0.d, x8]
+; CHECK-NEXT: ret
+entry:
+ %0 = shl i64 %idx, 3
+ tail call void @llvm.aarch64.sve.st1q.scatter.scalar.offset.nxv2i64.nxv2i64(<vscale x 2 x i64> %data, <vscale x 1 x i1> %pg, <vscale x 2 x i64> %base, i64 %0)
+ ret void
+}
+
+define void @test_svst1q_scatter_u64base_index_bf16(<vscale x 1 x i1> %pg, <vscale x 2 x i64> %base, i64 %idx, <vscale x 8 x bfloat> %data) {
+; CHECK-LABEL: test_svst1q_scatter_u64base_index_bf16:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: lsl x8, x0, #1
+; CHECK-NEXT: st1q { z1.q }, p0, [z0.d, x8]
+; CHECK-NEXT: ret
+entry:
+ %0 = shl i64 %idx, 1
+ tail call void @llvm.aarch64.sve.st1q.scatter.scalar.offset.nxv8bf16.nxv2i64(<vscale x 8 x bfloat> %data, <vscale x 1 x i1> %pg, <vscale x 2 x i64> %base, i64 %0)
+ ret void
+}
+
+define void @test_svst1q_scatter_u64base_index_f16(<vscale x 1 x i1> %pg, <vscale x 2 x i64> %base, i64 %idx, <vscale x 8 x half> %data) {
+; CHECK-LABEL: test_svst1q_scatter_u64base_index_f16:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: lsl x8, x0, #1
+; CHECK-NEXT: st1q { z1.q }, p0, [z0.d, x8]
+; CHECK-NEXT: ret
+entry:
+ %0 = shl i64 %idx, 1
+ tail call void @llvm.aarch64.sve.st1q.scatter.scalar.offset.nxv8f16.nxv2i64(<vscale x 8 x half> %data, <vscale x 1 x i1> %pg, <vscale x 2 x i64> %base, i64 %0)
+ ret void
+}
+
+define void @test_svst1q_scatter_u64base_index_f32(<vscale x 1 x i1> %pg, <vscale x 2 x i64> %base, i64 %idx, <vscale x 4 x float> %data) {
+; CHECK-LABEL: test_svst1q_scatter_u64base_index_f32:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: lsl x8, x0, #2
+; CHECK-NEXT: st1q { z1.q }, p0, [z0.d, x8]
+; CHECK-NEXT: ret
+entry:
+ %0 = shl i64 %idx, 2
+ tail call void @llvm.aarch64.sve.st1q.scatter.scalar.offset.nxv4f32.nxv2i64(<vscale x 4 x float> %data, <vscale x 1 x i1> %pg, <vscale x 2 x i64> %base, i64 %0)
+ ret void
+}
+
+define void @test_svst1q_scatter_u64base_index_f64(<vscale x 1 x i1> %pg, <vscale x 2 x i64> %base, i64 %idx, <vscale x 2 x double> %data) {
+; CHECK-LABEL: test_svst1q_scatter_u64base_index_f64:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: lsl x8, x0, #3
+; CHECK-NEXT: st1q { z1.q }, p0, [z0.d, x8]
+; CHECK-NEXT: ret
+entry:
+ %0 = shl i64 %idx, 3
+ tail call void @llvm.aarch64.sve.st1q.scatter.scalar.offset.nxv2f64.nxv2i64(<vscale x 2 x double> %data, <vscale x 1 x i1> %pg, <vscale x 2 x i64> %base, i64 %0)
+ ret void
+}
>From c2fa16434e3453da082104ebe5b55367286d471d Mon Sep 17 00:00:00 2001
From: Momchil Velikov <momchil.velikov at arm.com>
Date: Fri, 10 Nov 2023 15:34:12 +0000
Subject: [PATCH 2/4] Fixup patch
---
clang/include/clang/Basic/arm_sve.td | 8 ++++----
1 file changed, 4 insertions(+), 4 deletions(-)
diff --git a/clang/include/clang/Basic/arm_sve.td b/clang/include/clang/Basic/arm_sve.td
index ee46a81a942c3f3..a1ac926ab9577bb 100644
--- a/clang/include/clang/Basic/arm_sve.td
+++ b/clang/include/clang/Basic/arm_sve.td
@@ -321,10 +321,10 @@ let TargetGuard = "sve2p1" in {
defm SVLD4Q_VNUM : StructLoad<"svld4q_vnum[_{2}]", "4Pcl", "aarch64_sve_ld4q_sret">;
// Load quadwords (scalar base + vector index)
- def SVLD1Q_GATHER_INDICES_U : MInst<"svld1q_gather_[{3}]index[_{0}]", "dPcg", "sUsiUilUlbhfd", [IsGatherLoad], MemEltTyDefault, "aarch64_sve_ld1q_gather_index">;
+ def SVLD1Q_GATHER_INDICES_U : MInst<"svld1q_gather_[{3}]index[_{d}]", "dPcg", "sUsiUilUlbhfd", [IsGatherLoad], MemEltTyDefault, "aarch64_sve_ld1q_gather_index">;
// Load quadwords (vector base + scalar index)
- def SVLD1Q_GATHER_INDEX_S : MInst<"svld1q_gather[_{2}base]_index_{0}", "dPgl", "sUsiUilUlbhfd", [IsGatherLoad], MemEltTyDefault, "aarch64_sve_ld1q_gather_scalar_offset">;
+ def SVLD1Q_GATHER_INDEX_S : MInst<"svld1q_gather[_{2}base]_index_{d}", "dPgl", "sUsiUilUlbhfd", [IsGatherLoad], MemEltTyDefault, "aarch64_sve_ld1q_gather_scalar_offset">;
}
////////////////////////////////////////////////////////////////////////////////
@@ -472,10 +472,10 @@ let TargetGuard = "sve2p1" in {
defm SVST4Q_VNUM : StructStore<"svst4q_vnum[_{d}]", "vPcl4", "aarch64_sve_st4q">;
// Scatter store quadwords (scalar base + vector index)
- def SVST1Q_SCATTER_INDICES_U : MInst<"svst1q_scatter_[{3}]index[_{0}]", "vPpgd", "sUsiUilUlbhfd", [IsScatterStore], MemEltTyDefault, "aarch64_sve_st1q_scatter_index">;
+ def SVST1Q_SCATTER_INDICES_U : MInst<"svst1q_scatter_[{3}]index[_{d}]", "vPpgd", "sUsiUilUlbhfd", [IsScatterStore], MemEltTyDefault, "aarch64_sve_st1q_scatter_index">;
// Scatter store quadwords (vector base + scalar index)
- def SVST1Q_SCATTER_INDEX_S : MInst<"svst1q_scatter[_{2}base]_index[_{0}]", "vPgld", "sUsiUilUlbhfd", [IsScatterStore], MemEltTyDefault, "aarch64_sve_st1q_scatter_scalar_offset">;
+ def SVST1Q_SCATTER_INDEX_S : MInst<"svst1q_scatter[_{2}base]_index[_{d}]", "vPgld", "sUsiUilUlbhfd", [IsScatterStore], MemEltTyDefault, "aarch64_sve_st1q_scatter_scalar_offset">;
}
////////////////////////////////////////////////////////////////////////////////
>From 5a3320daaba63d18dc26e7d16f7bb86cb7dc571b Mon Sep 17 00:00:00 2001
From: Momchil Velikov <momchil.velikov at arm.com>
Date: Sat, 4 Nov 2023 12:02:06 +0000
Subject: [PATCH 3/4] [AArch64] Add quadword gather load/scatter store
intrinsics with unscaled vector offset
This patch adds intrinsics of the form
sv<type>_t svld1q_gather_u64offset_<typ>(svbool_t pg, const <type>_t *base, svuint64_t offs);
void svst1q_scatter_u64offset_<typ>(svbool_t pg, <type>_t *base, svuint64_t offs, sv<type>_t data);
as well as their short forms.
ACLE spec: ARM-software/acle#257
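For illustration, a minimal use of the new unscaled-offset forms (a sketch; assumes
<arm_sve.h> from a compiler with SVE2.1 support and an sve2p1 target; the function
and variable names are made up for the example):

  #include <arm_sve.h>
  #include <stdint.h>

  void gather_then_scatter(const int32_t *src, int32_t *dst,
                           svuint64_t byte_off, svbool_t pg) {
    /* Offsets are byte offsets (unscaled), unlike the _index forms. */
    svint32_t v = svld1q_gather_u64offset_s32(pg, src, byte_off);
    svst1q_scatter_u64offset_s32(pg, dst, byte_off, v);
  }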
---
clang/include/clang/Basic/arm_sve.td | 6 +
.../acle_sve2p1_loads.c | 192 ++++++++++++++++++
.../acle_sve2p1_store.c | 192 ++++++++++++++++++
llvm/include/llvm/IR/IntrinsicsAArch64.td | 6 +
.../Target/AArch64/AArch64ISelLowering.cpp | 2 +
...ics-gather-loads-128bit-unscaled-offset.ll | 129 +++++++++++-
...s-scatter-stores-128bit-unscaled-offset.ll | 129 +++++++++++-
7 files changed, 654 insertions(+), 2 deletions(-)
diff --git a/clang/include/clang/Basic/arm_sve.td b/clang/include/clang/Basic/arm_sve.td
index a1ac926ab9577bb..c32270dc4c85256 100644
--- a/clang/include/clang/Basic/arm_sve.td
+++ b/clang/include/clang/Basic/arm_sve.td
@@ -310,6 +310,9 @@ let TargetGuard = "sve2p1" in {
def SVLD1Q_GATHER_U64BASE_OFFSET : MInst<"svld1q_gather[_{2}base]_offset_{d}", "dPgl", "cUcsUsiUilUlfhdb", [IsGatherLoad, IsByteIndexed], MemEltTyDefault, "aarch64_sve_ld1q_gather_scalar_offset">;
def SVLD1Q_GATHER_U64BASE : MInst<"svld1q_gather[_{2}base]_{d}", "dPg", "cUcsUsiUilUlfhdb", [IsGatherLoad, IsByteIndexed], MemEltTyDefault, "aarch64_sve_ld1q_gather_scalar_offset">;
+ // Load one vector (scalar base + vector offset)
+ def SVLD1Q_GATHER_U64OFFSET : MInst<"svld1q_gather_[{3}]offset[_{0}]", "dPcg", "cUcsUsiUilUlfhdb", [IsGatherLoad, IsByteIndexed], MemEltTyDefault, "aarch64_sve_ld1q_gather_vector_offset">;
+
// Load N-element structure into N vectors (scalar base)
defm SVLD2Q : StructLoad<"svld2q[_{2}]", "2Pc", "aarch64_sve_ld2q_sret">;
defm SVLD3Q : StructLoad<"svld3q[_{2}]", "3Pc", "aarch64_sve_ld3q_sret">;
@@ -461,6 +464,9 @@ let TargetGuard = "sve2p1" in {
def SVST1Q_SCATTER_U64BASE_OFFSET : MInst<"svst1q_scatter[_{2}base]_offset[_{d}]", "vPgld", "cUcsUsiUilUlfhdb", [IsScatterStore, IsByteIndexed], MemEltTyDefault, "aarch64_sve_st1q_scatter_scalar_offset">;
def SVST1Q_SCATTER_U64BASE : MInst<"svst1q_scatter[_{2}base][_{d}]", "vPgd", "cUcsUsiUilUlfhdb", [IsScatterStore, IsByteIndexed], MemEltTyDefault, "aarch64_sve_st1q_scatter_scalar_offset">;
+ // Store one vector (scalar base + vector offset)
+ def SVST1Q_SCATTER_U64OFFSET : MInst<"svst1q_scatter_[{3}]offset[_{0}]", "vPpgd", "cUcsUsiUilUlfhdb", [IsScatterStore, IsByteIndexed], MemEltTyDefault, "aarch64_sve_st1q_scatter_vector_offset">;
+
// Store N vectors into N-element structure (scalar base)
defm SVST2Q : StructStore<"svst2q[_{d}]", "vPc2", "aarch64_sve_st2q">;
defm SVST3Q : StructStore<"svst3q[_{d}]", "vPc3", "aarch64_sve_st3q">;
diff --git a/clang/test/CodeGen/aarch64-sve2p1-intrinsics/acle_sve2p1_loads.c b/clang/test/CodeGen/aarch64-sve2p1-intrinsics/acle_sve2p1_loads.c
index 44351347e4cf0ae..ae3ddd416f7ee7d 100644
--- a/clang/test/CodeGen/aarch64-sve2p1-intrinsics/acle_sve2p1_loads.c
+++ b/clang/test/CodeGen/aarch64-sve2p1-intrinsics/acle_sve2p1_loads.c
@@ -2868,3 +2868,195 @@ svfloat32_t test_svld1q_gather_u64base_index_f32(svbool_t pg, svuint64_t base, i
svfloat64_t test_svld1q_gather_u64base_index_f64(svbool_t pg, svuint64_t base, int64_t idx) {
return SVE_ACLE_FUNC(svld1q_gather,_u64base,_index_f64,)(pg, base, idx);
}
+
+// CHECK-LABEL: @test_svld1q_gather_u64offset_s8(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 1 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv1i1(<vscale x 16 x i1> [[PG:%.*]])
+// CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 16 x i8> @llvm.aarch64.sve.ld1q.gather.vector.offset.nxv16i8(<vscale x 1 x i1> [[TMP0]], ptr [[BASE:%.*]], <vscale x 2 x i64> [[OFF:%.*]])
+// CHECK-NEXT: ret <vscale x 16 x i8> [[TMP1]]
+//
+// CPP-CHECK-LABEL: @_Z31test_svld1q_gather_u64offset_s8u10__SVBool_tPKau12__SVUint64_t(
+// CPP-CHECK-NEXT: entry:
+// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 1 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv1i1(<vscale x 16 x i1> [[PG:%.*]])
+// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 16 x i8> @llvm.aarch64.sve.ld1q.gather.vector.offset.nxv16i8(<vscale x 1 x i1> [[TMP0]], ptr [[BASE:%.*]], <vscale x 2 x i64> [[OFF:%.*]])
+// CPP-CHECK-NEXT: ret <vscale x 16 x i8> [[TMP1]]
+//
+svint8_t test_svld1q_gather_u64offset_s8(svbool_t pg, const int8_t *base, svuint64_t off) {
+ return SVE_ACLE_FUNC(svld1q_gather_,u64,offset,_s8)(pg, base, off);
+}
+
+// CHECK-LABEL: @test_svld1q_gather_u64offset_u8(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 1 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv1i1(<vscale x 16 x i1> [[PG:%.*]])
+// CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 16 x i8> @llvm.aarch64.sve.ld1q.gather.vector.offset.nxv16i8(<vscale x 1 x i1> [[TMP0]], ptr [[BASE:%.*]], <vscale x 2 x i64> [[OFF:%.*]])
+// CHECK-NEXT: ret <vscale x 16 x i8> [[TMP1]]
+//
+// CPP-CHECK-LABEL: @_Z31test_svld1q_gather_u64offset_u8u10__SVBool_tPKhu12__SVUint64_t(
+// CPP-CHECK-NEXT: entry:
+// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 1 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv1i1(<vscale x 16 x i1> [[PG:%.*]])
+// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 16 x i8> @llvm.aarch64.sve.ld1q.gather.vector.offset.nxv16i8(<vscale x 1 x i1> [[TMP0]], ptr [[BASE:%.*]], <vscale x 2 x i64> [[OFF:%.*]])
+// CPP-CHECK-NEXT: ret <vscale x 16 x i8> [[TMP1]]
+//
+svuint8_t test_svld1q_gather_u64offset_u8(svbool_t pg, const uint8_t *base, svuint64_t off) {
+ return SVE_ACLE_FUNC(svld1q_gather_,u64,offset,_u8)(pg, base, off);
+}
+
+// CHECK-LABEL: @test_svld1q_gather_u64offset_s16(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 1 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv1i1(<vscale x 16 x i1> [[PG:%.*]])
+// CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 8 x i16> @llvm.aarch64.sve.ld1q.gather.vector.offset.nxv8i16(<vscale x 1 x i1> [[TMP0]], ptr [[BASE:%.*]], <vscale x 2 x i64> [[OFF:%.*]])
+// CHECK-NEXT: ret <vscale x 8 x i16> [[TMP1]]
+//
+// CPP-CHECK-LABEL: @_Z32test_svld1q_gather_u64offset_s16u10__SVBool_tPKsu12__SVUint64_t(
+// CPP-CHECK-NEXT: entry:
+// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 1 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv1i1(<vscale x 16 x i1> [[PG:%.*]])
+// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 8 x i16> @llvm.aarch64.sve.ld1q.gather.vector.offset.nxv8i16(<vscale x 1 x i1> [[TMP0]], ptr [[BASE:%.*]], <vscale x 2 x i64> [[OFF:%.*]])
+// CPP-CHECK-NEXT: ret <vscale x 8 x i16> [[TMP1]]
+//
+svint16_t test_svld1q_gather_u64offset_s16(svbool_t pg, const int16_t *base, svuint64_t off) {
+ return SVE_ACLE_FUNC(svld1q_gather_,u64,offset,_s16)(pg, base, off);
+}
+
+// CHECK-LABEL: @test_svld1q_gather_u64offset_u16(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 1 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv1i1(<vscale x 16 x i1> [[PG:%.*]])
+// CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 8 x i16> @llvm.aarch64.sve.ld1q.gather.vector.offset.nxv8i16(<vscale x 1 x i1> [[TMP0]], ptr [[BASE:%.*]], <vscale x 2 x i64> [[OFF:%.*]])
+// CHECK-NEXT: ret <vscale x 8 x i16> [[TMP1]]
+//
+// CPP-CHECK-LABEL: @_Z32test_svld1q_gather_u64offset_u16u10__SVBool_tPKtu12__SVUint64_t(
+// CPP-CHECK-NEXT: entry:
+// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 1 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv1i1(<vscale x 16 x i1> [[PG:%.*]])
+// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 8 x i16> @llvm.aarch64.sve.ld1q.gather.vector.offset.nxv8i16(<vscale x 1 x i1> [[TMP0]], ptr [[BASE:%.*]], <vscale x 2 x i64> [[OFF:%.*]])
+// CPP-CHECK-NEXT: ret <vscale x 8 x i16> [[TMP1]]
+//
+svuint16_t test_svld1q_gather_u64offset_u16(svbool_t pg, const uint16_t *base, svuint64_t off) {
+ return SVE_ACLE_FUNC(svld1q_gather_,u64,offset,_u16)(pg, base, off);
+}
+
+// CHECK-LABEL: @test_svld1q_gather_u64offset_s32(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 1 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv1i1(<vscale x 16 x i1> [[PG:%.*]])
+// CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 4 x i32> @llvm.aarch64.sve.ld1q.gather.vector.offset.nxv4i32(<vscale x 1 x i1> [[TMP0]], ptr [[BASE:%.*]], <vscale x 2 x i64> [[OFF:%.*]])
+// CHECK-NEXT: ret <vscale x 4 x i32> [[TMP1]]
+//
+// CPP-CHECK-LABEL: @_Z32test_svld1q_gather_u64offset_s32u10__SVBool_tPKiu12__SVUint64_t(
+// CPP-CHECK-NEXT: entry:
+// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 1 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv1i1(<vscale x 16 x i1> [[PG:%.*]])
+// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 4 x i32> @llvm.aarch64.sve.ld1q.gather.vector.offset.nxv4i32(<vscale x 1 x i1> [[TMP0]], ptr [[BASE:%.*]], <vscale x 2 x i64> [[OFF:%.*]])
+// CPP-CHECK-NEXT: ret <vscale x 4 x i32> [[TMP1]]
+//
+svint32_t test_svld1q_gather_u64offset_s32(svbool_t pg, const int32_t *base, svuint64_t off) {
+ return SVE_ACLE_FUNC(svld1q_gather_,u64,offset,_s32)(pg, base, off);
+}
+
+// CHECK-LABEL: @test_svld1q_gather_u64offset_u32(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 1 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv1i1(<vscale x 16 x i1> [[PG:%.*]])
+// CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 4 x i32> @llvm.aarch64.sve.ld1q.gather.vector.offset.nxv4i32(<vscale x 1 x i1> [[TMP0]], ptr [[BASE:%.*]], <vscale x 2 x i64> [[OFF:%.*]])
+// CHECK-NEXT: ret <vscale x 4 x i32> [[TMP1]]
+//
+// CPP-CHECK-LABEL: @_Z32test_svld1q_gather_u64offset_u32u10__SVBool_tPKju12__SVUint64_t(
+// CPP-CHECK-NEXT: entry:
+// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 1 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv1i1(<vscale x 16 x i1> [[PG:%.*]])
+// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 4 x i32> @llvm.aarch64.sve.ld1q.gather.vector.offset.nxv4i32(<vscale x 1 x i1> [[TMP0]], ptr [[BASE:%.*]], <vscale x 2 x i64> [[OFF:%.*]])
+// CPP-CHECK-NEXT: ret <vscale x 4 x i32> [[TMP1]]
+//
+svuint32_t test_svld1q_gather_u64offset_u32(svbool_t pg, const uint32_t *base, svuint64_t off) {
+ return SVE_ACLE_FUNC(svld1q_gather_,u64,offset,_u32)(pg, base, off);
+}
+
+// CHECK-LABEL: @test_svld1q_gather_u64offset_s64(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 1 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv1i1(<vscale x 16 x i1> [[PG:%.*]])
+// CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 2 x i64> @llvm.aarch64.sve.ld1q.gather.vector.offset.nxv2i64(<vscale x 1 x i1> [[TMP0]], ptr [[BASE:%.*]], <vscale x 2 x i64> [[OFF:%.*]])
+// CHECK-NEXT: ret <vscale x 2 x i64> [[TMP1]]
+//
+// CPP-CHECK-LABEL: @_Z32test_svld1q_gather_u64offset_s64u10__SVBool_tPKlu12__SVUint64_t(
+// CPP-CHECK-NEXT: entry:
+// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 1 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv1i1(<vscale x 16 x i1> [[PG:%.*]])
+// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 2 x i64> @llvm.aarch64.sve.ld1q.gather.vector.offset.nxv2i64(<vscale x 1 x i1> [[TMP0]], ptr [[BASE:%.*]], <vscale x 2 x i64> [[OFF:%.*]])
+// CPP-CHECK-NEXT: ret <vscale x 2 x i64> [[TMP1]]
+//
+svint64_t test_svld1q_gather_u64offset_s64(svbool_t pg, const int64_t *base, svuint64_t off) {
+ return SVE_ACLE_FUNC(svld1q_gather_,u64,offset,_s64)(pg, base, off);
+}
+
+// CHECK-LABEL: @test_svld1q_gather_u64offset_u64(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 1 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv1i1(<vscale x 16 x i1> [[PG:%.*]])
+// CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 2 x i64> @llvm.aarch64.sve.ld1q.gather.vector.offset.nxv2i64(<vscale x 1 x i1> [[TMP0]], ptr [[BASE:%.*]], <vscale x 2 x i64> [[OFF:%.*]])
+// CHECK-NEXT: ret <vscale x 2 x i64> [[TMP1]]
+//
+// CPP-CHECK-LABEL: @_Z32test_svld1q_gather_u64offset_u64u10__SVBool_tPKmu12__SVUint64_t(
+// CPP-CHECK-NEXT: entry:
+// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 1 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv1i1(<vscale x 16 x i1> [[PG:%.*]])
+// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 2 x i64> @llvm.aarch64.sve.ld1q.gather.vector.offset.nxv2i64(<vscale x 1 x i1> [[TMP0]], ptr [[BASE:%.*]], <vscale x 2 x i64> [[OFF:%.*]])
+// CPP-CHECK-NEXT: ret <vscale x 2 x i64> [[TMP1]]
+//
+svuint64_t test_svld1q_gather_u64offset_u64(svbool_t pg, const uint64_t *base, svuint64_t off) {
+ return SVE_ACLE_FUNC(svld1q_gather_,u64,offset,_u64)(pg, base, off);
+}
+
+// CHECK-LABEL: @test_svld1q_gather_u64offset_bf16(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 1 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv1i1(<vscale x 16 x i1> [[PG:%.*]])
+// CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 8 x bfloat> @llvm.aarch64.sve.ld1q.gather.vector.offset.nxv8bf16(<vscale x 1 x i1> [[TMP0]], ptr [[BASE:%.*]], <vscale x 2 x i64> [[OFF:%.*]])
+// CHECK-NEXT: ret <vscale x 8 x bfloat> [[TMP1]]
+//
+// CPP-CHECK-LABEL: @_Z33test_svld1q_gather_u64offset_bf16u10__SVBool_tPKu6__bf16u12__SVUint64_t(
+// CPP-CHECK-NEXT: entry:
+// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 1 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv1i1(<vscale x 16 x i1> [[PG:%.*]])
+// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 8 x bfloat> @llvm.aarch64.sve.ld1q.gather.vector.offset.nxv8bf16(<vscale x 1 x i1> [[TMP0]], ptr [[BASE:%.*]], <vscale x 2 x i64> [[OFF:%.*]])
+// CPP-CHECK-NEXT: ret <vscale x 8 x bfloat> [[TMP1]]
+//
+svbfloat16_t test_svld1q_gather_u64offset_bf16(svbool_t pg, const bfloat16_t *base, svuint64_t off) {
+ return SVE_ACLE_FUNC(svld1q_gather_,u64,offset,_bf16)(pg, base, off);
+}
+
+// CHECK-LABEL: @test_svld1q_gather_u64offset_f16(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 1 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv1i1(<vscale x 16 x i1> [[PG:%.*]])
+// CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 8 x half> @llvm.aarch64.sve.ld1q.gather.vector.offset.nxv8f16(<vscale x 1 x i1> [[TMP0]], ptr [[BASE:%.*]], <vscale x 2 x i64> [[OFF:%.*]])
+// CHECK-NEXT: ret <vscale x 8 x half> [[TMP1]]
+//
+// CPP-CHECK-LABEL: @_Z32test_svld1q_gather_u64offset_f16u10__SVBool_tPKDhu12__SVUint64_t(
+// CPP-CHECK-NEXT: entry:
+// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 1 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv1i1(<vscale x 16 x i1> [[PG:%.*]])
+// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 8 x half> @llvm.aarch64.sve.ld1q.gather.vector.offset.nxv8f16(<vscale x 1 x i1> [[TMP0]], ptr [[BASE:%.*]], <vscale x 2 x i64> [[OFF:%.*]])
+// CPP-CHECK-NEXT: ret <vscale x 8 x half> [[TMP1]]
+//
+svfloat16_t test_svld1q_gather_u64offset_f16(svbool_t pg, const float16_t *base, svuint64_t off) {
+ return SVE_ACLE_FUNC(svld1q_gather_,u64,offset,_f16)(pg, base, off);
+}
+
+// CHECK-LABEL: @test_svld1q_gather_u64offset_f32(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 1 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv1i1(<vscale x 16 x i1> [[PG:%.*]])
+// CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 4 x float> @llvm.aarch64.sve.ld1q.gather.vector.offset.nxv4f32(<vscale x 1 x i1> [[TMP0]], ptr [[BASE:%.*]], <vscale x 2 x i64> [[OFF:%.*]])
+// CHECK-NEXT: ret <vscale x 4 x float> [[TMP1]]
+//
+// CPP-CHECK-LABEL: @_Z32test_svld1q_gather_u64offset_f32u10__SVBool_tPKfu12__SVUint64_t(
+// CPP-CHECK-NEXT: entry:
+// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 1 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv1i1(<vscale x 16 x i1> [[PG:%.*]])
+// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 4 x float> @llvm.aarch64.sve.ld1q.gather.vector.offset.nxv4f32(<vscale x 1 x i1> [[TMP0]], ptr [[BASE:%.*]], <vscale x 2 x i64> [[OFF:%.*]])
+// CPP-CHECK-NEXT: ret <vscale x 4 x float> [[TMP1]]
+//
+svfloat32_t test_svld1q_gather_u64offset_f32(svbool_t pg, const float32_t *base, svuint64_t off) {
+ return SVE_ACLE_FUNC(svld1q_gather_,u64,offset,_f32)(pg, base, off);
+}
+
+// CHECK-LABEL: @test_svld1q_gather_u64offset_f64(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 1 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv1i1(<vscale x 16 x i1> [[PG:%.*]])
+// CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 2 x double> @llvm.aarch64.sve.ld1q.gather.vector.offset.nxv2f64(<vscale x 1 x i1> [[TMP0]], ptr [[BASE:%.*]], <vscale x 2 x i64> [[OFF:%.*]])
+// CHECK-NEXT: ret <vscale x 2 x double> [[TMP1]]
+//
+// CPP-CHECK-LABEL: @_Z32test_svld1q_gather_u64offset_f64u10__SVBool_tPKdu12__SVUint64_t(
+// CPP-CHECK-NEXT: entry:
+// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 1 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv1i1(<vscale x 16 x i1> [[PG:%.*]])
+// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 2 x double> @llvm.aarch64.sve.ld1q.gather.vector.offset.nxv2f64(<vscale x 1 x i1> [[TMP0]], ptr [[BASE:%.*]], <vscale x 2 x i64> [[OFF:%.*]])
+// CPP-CHECK-NEXT: ret <vscale x 2 x double> [[TMP1]]
+//
+svfloat64_t test_svld1q_gather_u64offset_f64(svbool_t pg, const float64_t *base, svuint64_t off) {
+ return SVE_ACLE_FUNC(svld1q_gather_,u64,offset,_f64)(pg, base, off);
+}
diff --git a/clang/test/CodeGen/aarch64-sve2p1-intrinsics/acle_sve2p1_store.c b/clang/test/CodeGen/aarch64-sve2p1-intrinsics/acle_sve2p1_store.c
index 137801cc0814a10..2cbea29d3390436 100644
--- a/clang/test/CodeGen/aarch64-sve2p1-intrinsics/acle_sve2p1_store.c
+++ b/clang/test/CodeGen/aarch64-sve2p1-intrinsics/acle_sve2p1_store.c
@@ -2470,3 +2470,195 @@ void test_svst1q_scatter_u64base_index_f32(svbool_t pg, svuint64_t base, int64_t
void test_svst1q_scatter_u64base_index_f64(svbool_t pg, svuint64_t base, int64_t idx, svfloat64_t data) {
SVE_ACLE_FUNC(svst1q_scatter,_u64base,_index,_f64)(pg, base, idx, data);
}
+
+// CHECK-LABEL: @test_svst1q_scatter_u64offset_s8(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 1 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv1i1(<vscale x 16 x i1> [[PG:%.*]])
+// CHECK-NEXT: tail call void @llvm.aarch64.sve.st1q.scatter.vector.offset.nxv16i8(<vscale x 16 x i8> [[DATA:%.*]], <vscale x 1 x i1> [[TMP0]], ptr [[BASE:%.*]], <vscale x 2 x i64> [[OFF:%.*]])
+// CHECK-NEXT: ret void
+//
+// CPP-CHECK-LABEL: @_Z32test_svst1q_scatter_u64offset_s8u10__SVBool_tPau12__SVUint64_tu10__SVInt8_t(
+// CPP-CHECK-NEXT: entry:
+// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 1 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv1i1(<vscale x 16 x i1> [[PG:%.*]])
+// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sve.st1q.scatter.vector.offset.nxv16i8(<vscale x 16 x i8> [[DATA:%.*]], <vscale x 1 x i1> [[TMP0]], ptr [[BASE:%.*]], <vscale x 2 x i64> [[OFF:%.*]])
+// CPP-CHECK-NEXT: ret void
+//
+void test_svst1q_scatter_u64offset_s8(svbool_t pg, int8_t *base, svuint64_t off, svint8_t data) {
+ SVE_ACLE_FUNC(svst1q_scatter_,u64,offset,_s8)(pg, base, off, data);
+}
+
+// CHECK-LABEL: @test_svst1q_scatter_u64offset_u8(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 1 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv1i1(<vscale x 16 x i1> [[PG:%.*]])
+// CHECK-NEXT: tail call void @llvm.aarch64.sve.st1q.scatter.vector.offset.nxv16i8(<vscale x 16 x i8> [[DATA:%.*]], <vscale x 1 x i1> [[TMP0]], ptr [[BASE:%.*]], <vscale x 2 x i64> [[OFF:%.*]])
+// CHECK-NEXT: ret void
+//
+// CPP-CHECK-LABEL: @_Z32test_svst1q_scatter_u64offset_u8u10__SVBool_tPhu12__SVUint64_tu11__SVUint8_t(
+// CPP-CHECK-NEXT: entry:
+// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 1 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv1i1(<vscale x 16 x i1> [[PG:%.*]])
+// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sve.st1q.scatter.vector.offset.nxv16i8(<vscale x 16 x i8> [[DATA:%.*]], <vscale x 1 x i1> [[TMP0]], ptr [[BASE:%.*]], <vscale x 2 x i64> [[OFF:%.*]])
+// CPP-CHECK-NEXT: ret void
+//
+void test_svst1q_scatter_u64offset_u8(svbool_t pg, uint8_t *base, svuint64_t off, svuint8_t data) {
+ SVE_ACLE_FUNC(svst1q_scatter_,u64,offset,_u8)(pg, base, off, data);
+}
+
+// CHECK-LABEL: @test_svst1q_scatter_u64offset_s16(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 1 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv1i1(<vscale x 16 x i1> [[PG:%.*]])
+// CHECK-NEXT: tail call void @llvm.aarch64.sve.st1q.scatter.vector.offset.nxv8i16(<vscale x 8 x i16> [[DATA:%.*]], <vscale x 1 x i1> [[TMP0]], ptr [[BASE:%.*]], <vscale x 2 x i64> [[OFF:%.*]])
+// CHECK-NEXT: ret void
+//
+// CPP-CHECK-LABEL: @_Z33test_svst1q_scatter_u64offset_s16u10__SVBool_tPsu12__SVUint64_tu11__SVInt16_t(
+// CPP-CHECK-NEXT: entry:
+// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 1 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv1i1(<vscale x 16 x i1> [[PG:%.*]])
+// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sve.st1q.scatter.vector.offset.nxv8i16(<vscale x 8 x i16> [[DATA:%.*]], <vscale x 1 x i1> [[TMP0]], ptr [[BASE:%.*]], <vscale x 2 x i64> [[OFF:%.*]])
+// CPP-CHECK-NEXT: ret void
+//
+void test_svst1q_scatter_u64offset_s16(svbool_t pg, int16_t *base, svuint64_t off, svint16_t data) {
+ SVE_ACLE_FUNC(svst1q_scatter_,u64,offset,_s16)(pg, base, off, data);
+}
+
+// CHECK-LABEL: @test_svst1q_scatter_u64offset_u16(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 1 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv1i1(<vscale x 16 x i1> [[PG:%.*]])
+// CHECK-NEXT: tail call void @llvm.aarch64.sve.st1q.scatter.vector.offset.nxv8i16(<vscale x 8 x i16> [[DATA:%.*]], <vscale x 1 x i1> [[TMP0]], ptr [[BASE:%.*]], <vscale x 2 x i64> [[OFF:%.*]])
+// CHECK-NEXT: ret void
+//
+// CPP-CHECK-LABEL: @_Z33test_svst1q_scatter_u64offset_u16u10__SVBool_tPtu12__SVUint64_tu12__SVUint16_t(
+// CPP-CHECK-NEXT: entry:
+// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 1 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv1i1(<vscale x 16 x i1> [[PG:%.*]])
+// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sve.st1q.scatter.vector.offset.nxv8i16(<vscale x 8 x i16> [[DATA:%.*]], <vscale x 1 x i1> [[TMP0]], ptr [[BASE:%.*]], <vscale x 2 x i64> [[OFF:%.*]])
+// CPP-CHECK-NEXT: ret void
+//
+void test_svst1q_scatter_u64offset_u16(svbool_t pg, uint16_t *base, svuint64_t off, svuint16_t data) {
+ SVE_ACLE_FUNC(svst1q_scatter_,u64,offset,_u16)(pg, base, off, data);
+}
+
+// CHECK-LABEL: @test_svst1q_scatter_u64offset_s32(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 1 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv1i1(<vscale x 16 x i1> [[PG:%.*]])
+// CHECK-NEXT: tail call void @llvm.aarch64.sve.st1q.scatter.vector.offset.nxv4i32(<vscale x 4 x i32> [[DATA:%.*]], <vscale x 1 x i1> [[TMP0]], ptr [[BASE:%.*]], <vscale x 2 x i64> [[OFF:%.*]])
+// CHECK-NEXT: ret void
+//
+// CPP-CHECK-LABEL: @_Z33test_svst1q_scatter_u64offset_s32u10__SVBool_tPiu12__SVUint64_tu11__SVInt32_t(
+// CPP-CHECK-NEXT: entry:
+// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 1 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv1i1(<vscale x 16 x i1> [[PG:%.*]])
+// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sve.st1q.scatter.vector.offset.nxv4i32(<vscale x 4 x i32> [[DATA:%.*]], <vscale x 1 x i1> [[TMP0]], ptr [[BASE:%.*]], <vscale x 2 x i64> [[OFF:%.*]])
+// CPP-CHECK-NEXT: ret void
+//
+void test_svst1q_scatter_u64offset_s32(svbool_t pg, int32_t *base, svuint64_t off, svint32_t data) {
+ SVE_ACLE_FUNC(svst1q_scatter_,u64,offset,_s32)(pg, base, off, data);
+}
+
+// CHECK-LABEL: @test_svst1q_scatter_u64offset_u32(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 1 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv1i1(<vscale x 16 x i1> [[PG:%.*]])
+// CHECK-NEXT: tail call void @llvm.aarch64.sve.st1q.scatter.vector.offset.nxv4i32(<vscale x 4 x i32> [[DATA:%.*]], <vscale x 1 x i1> [[TMP0]], ptr [[BASE:%.*]], <vscale x 2 x i64> [[OFF:%.*]])
+// CHECK-NEXT: ret void
+//
+// CPP-CHECK-LABEL: @_Z33test_svst1q_scatter_u64offset_u32u10__SVBool_tPju12__SVUint64_tu12__SVUint32_t(
+// CPP-CHECK-NEXT: entry:
+// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 1 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv1i1(<vscale x 16 x i1> [[PG:%.*]])
+// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sve.st1q.scatter.vector.offset.nxv4i32(<vscale x 4 x i32> [[DATA:%.*]], <vscale x 1 x i1> [[TMP0]], ptr [[BASE:%.*]], <vscale x 2 x i64> [[OFF:%.*]])
+// CPP-CHECK-NEXT: ret void
+//
+void test_svst1q_scatter_u64offset_u32(svbool_t pg, uint32_t *base, svuint64_t off, svuint32_t data) {
+ SVE_ACLE_FUNC(svst1q_scatter_,u64,offset,_u32)(pg, base, off, data);
+}
+
+// CHECK-LABEL: @test_svst1q_scatter_u64offset_s64(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 1 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv1i1(<vscale x 16 x i1> [[PG:%.*]])
+// CHECK-NEXT: tail call void @llvm.aarch64.sve.st1q.scatter.vector.offset.nxv2i64(<vscale x 2 x i64> [[DATA:%.*]], <vscale x 1 x i1> [[TMP0]], ptr [[BASE:%.*]], <vscale x 2 x i64> [[OFF:%.*]])
+// CHECK-NEXT: ret void
+//
+// CPP-CHECK-LABEL: @_Z33test_svst1q_scatter_u64offset_s64u10__SVBool_tPlu12__SVUint64_tu11__SVInt64_t(
+// CPP-CHECK-NEXT: entry:
+// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 1 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv1i1(<vscale x 16 x i1> [[PG:%.*]])
+// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sve.st1q.scatter.vector.offset.nxv2i64(<vscale x 2 x i64> [[DATA:%.*]], <vscale x 1 x i1> [[TMP0]], ptr [[BASE:%.*]], <vscale x 2 x i64> [[OFF:%.*]])
+// CPP-CHECK-NEXT: ret void
+//
+void test_svst1q_scatter_u64offset_s64(svbool_t pg, int64_t *base, svuint64_t off, svint64_t data) {
+ SVE_ACLE_FUNC(svst1q_scatter_,u64,offset,_s64)(pg, base, off, data);
+}
+
+// CHECK-LABEL: @test_svst1q_scatter_u64offset_u64(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 1 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv1i1(<vscale x 16 x i1> [[PG:%.*]])
+// CHECK-NEXT: tail call void @llvm.aarch64.sve.st1q.scatter.vector.offset.nxv2i64(<vscale x 2 x i64> [[DATA:%.*]], <vscale x 1 x i1> [[TMP0]], ptr [[BASE:%.*]], <vscale x 2 x i64> [[OFF:%.*]])
+// CHECK-NEXT: ret void
+//
+// CPP-CHECK-LABEL: @_Z33test_svst1q_scatter_u64offset_u64u10__SVBool_tPmu12__SVUint64_tS1_(
+// CPP-CHECK-NEXT: entry:
+// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 1 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv1i1(<vscale x 16 x i1> [[PG:%.*]])
+// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sve.st1q.scatter.vector.offset.nxv2i64(<vscale x 2 x i64> [[DATA:%.*]], <vscale x 1 x i1> [[TMP0]], ptr [[BASE:%.*]], <vscale x 2 x i64> [[OFF:%.*]])
+// CPP-CHECK-NEXT: ret void
+//
+void test_svst1q_scatter_u64offset_u64(svbool_t pg, uint64_t *base, svuint64_t off, svuint64_t data) {
+ SVE_ACLE_FUNC(svst1q_scatter_,u64,offset,_u64)(pg, base, off, data);
+}
+
+// CHECK-LABEL: @test_svst1q_scatter_u64offset_bf16(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 1 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv1i1(<vscale x 16 x i1> [[PG:%.*]])
+// CHECK-NEXT: tail call void @llvm.aarch64.sve.st1q.scatter.vector.offset.nxv8bf16(<vscale x 8 x bfloat> [[DATA:%.*]], <vscale x 1 x i1> [[TMP0]], ptr [[BASE:%.*]], <vscale x 2 x i64> [[OFF:%.*]])
+// CHECK-NEXT: ret void
+//
+// CPP-CHECK-LABEL: @_Z34test_svst1q_scatter_u64offset_bf16u10__SVBool_tPu6__bf16u12__SVUint64_tu14__SVBfloat16_t(
+// CPP-CHECK-NEXT: entry:
+// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 1 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv1i1(<vscale x 16 x i1> [[PG:%.*]])
+// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sve.st1q.scatter.vector.offset.nxv8bf16(<vscale x 8 x bfloat> [[DATA:%.*]], <vscale x 1 x i1> [[TMP0]], ptr [[BASE:%.*]], <vscale x 2 x i64> [[OFF:%.*]])
+// CPP-CHECK-NEXT: ret void
+//
+void test_svst1q_scatter_u64offset_bf16(svbool_t pg, bfloat16_t *base, svuint64_t off, svbfloat16_t data) {
+ SVE_ACLE_FUNC(svst1q_scatter_,u64,offset,_bf16)(pg, base, off, data);
+}
+
+// CHECK-LABEL: @test_svst1q_scatter_u64offset_f16(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 1 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv1i1(<vscale x 16 x i1> [[PG:%.*]])
+// CHECK-NEXT: tail call void @llvm.aarch64.sve.st1q.scatter.vector.offset.nxv8f16(<vscale x 8 x half> [[DATA:%.*]], <vscale x 1 x i1> [[TMP0]], ptr [[BASE:%.*]], <vscale x 2 x i64> [[OFF:%.*]])
+// CHECK-NEXT: ret void
+//
+// CPP-CHECK-LABEL: @_Z33test_svst1q_scatter_u64offset_f16u10__SVBool_tPDhu12__SVUint64_tu13__SVFloat16_t(
+// CPP-CHECK-NEXT: entry:
+// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 1 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv1i1(<vscale x 16 x i1> [[PG:%.*]])
+// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sve.st1q.scatter.vector.offset.nxv8f16(<vscale x 8 x half> [[DATA:%.*]], <vscale x 1 x i1> [[TMP0]], ptr [[BASE:%.*]], <vscale x 2 x i64> [[OFF:%.*]])
+// CPP-CHECK-NEXT: ret void
+//
+void test_svst1q_scatter_u64offset_f16(svbool_t pg, float16_t *base, svuint64_t off, svfloat16_t data) {
+ SVE_ACLE_FUNC(svst1q_scatter_,u64,offset,_f16)(pg, base, off, data);
+}
+
+// CHECK-LABEL: @test_svst1q_scatter_u64offset_f32(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 1 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv1i1(<vscale x 16 x i1> [[PG:%.*]])
+// CHECK-NEXT: tail call void @llvm.aarch64.sve.st1q.scatter.vector.offset.nxv4f32(<vscale x 4 x float> [[DATA:%.*]], <vscale x 1 x i1> [[TMP0]], ptr [[BASE:%.*]], <vscale x 2 x i64> [[OFF:%.*]])
+// CHECK-NEXT: ret void
+//
+// CPP-CHECK-LABEL: @_Z33test_svst1q_scatter_u64offset_f32u10__SVBool_tPfu12__SVUint64_tu13__SVFloat32_t(
+// CPP-CHECK-NEXT: entry:
+// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 1 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv1i1(<vscale x 16 x i1> [[PG:%.*]])
+// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sve.st1q.scatter.vector.offset.nxv4f32(<vscale x 4 x float> [[DATA:%.*]], <vscale x 1 x i1> [[TMP0]], ptr [[BASE:%.*]], <vscale x 2 x i64> [[OFF:%.*]])
+// CPP-CHECK-NEXT: ret void
+//
+void test_svst1q_scatter_u64offset_f32(svbool_t pg, float32_t *base, svuint64_t off, svfloat32_t data) {
+ SVE_ACLE_FUNC(svst1q_scatter_,u64,offset,_f32)(pg, base, off, data);
+}
+
+// CHECK-LABEL: @test_svst1q_scatter_u64offset_f64(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 1 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv1i1(<vscale x 16 x i1> [[PG:%.*]])
+// CHECK-NEXT: tail call void @llvm.aarch64.sve.st1q.scatter.vector.offset.nxv2f64(<vscale x 2 x double> [[DATA:%.*]], <vscale x 1 x i1> [[TMP0]], ptr [[BASE:%.*]], <vscale x 2 x i64> [[OFF:%.*]])
+// CHECK-NEXT: ret void
+//
+// CPP-CHECK-LABEL: @_Z33test_svst1q_scatter_u64offset_f64u10__SVBool_tPdu12__SVUint64_tu13__SVFloat64_t(
+// CPP-CHECK-NEXT: entry:
+// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 1 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv1i1(<vscale x 16 x i1> [[PG:%.*]])
+// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sve.st1q.scatter.vector.offset.nxv2f64(<vscale x 2 x double> [[DATA:%.*]], <vscale x 1 x i1> [[TMP0]], ptr [[BASE:%.*]], <vscale x 2 x i64> [[OFF:%.*]])
+// CPP-CHECK-NEXT: ret void
+//
+void test_svst1q_scatter_u64offset_f64(svbool_t pg, float64_t *base, svuint64_t off, svfloat64_t data) {
+ SVE_ACLE_FUNC(svst1q_scatter_,u64,offset,_f64)(pg, base, off, data);
+}
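
Judging by the IR in the tests above, each 128-bit segment of DATA is stored to BASE plus the corresponding byte offset from OFF. A minimal usage sketch, assuming <arm_sve.h> exposes these declarations and the translation unit is built for sve2p1 (function and variable names here are illustrative, not from the patch):

  #include <arm_sve.h>

  // Scatter 128-bit blocks of f32 data to the byte offsets taken from off.
  void scatter_f32_blocks(float32_t *base, svuint64_t off, svfloat32_t data) {
    svbool_t pg = svptrue_b8();
    svst1q_scatter_u64offset_f32(pg, base, off, data);
  }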
diff --git a/llvm/include/llvm/IR/IntrinsicsAArch64.td b/llvm/include/llvm/IR/IntrinsicsAArch64.td
index a558a1eca84af6c..60a8d98f3bc0d26 100644
--- a/llvm/include/llvm/IR/IntrinsicsAArch64.td
+++ b/llvm/include/llvm/IR/IntrinsicsAArch64.td
@@ -2166,6 +2166,9 @@ def int_aarch64_sve_ld1_gather_uxtw_index : AdvSIMD_GatherLoad_SV_32b_Offsets_In
// 128-bit loads, scaled offsets (indices)
def int_aarch64_sve_ld1q_gather_index : AdvSIMD_GatherLoadQ_SV_Intrinsic;
+// 128-bit loads, unscaled offsets
+def int_aarch64_sve_ld1q_gather_vector_offset : AdvSIMD_GatherLoadQ_SV_Intrinsic;
+
//
// Gather loads: vector base + scalar offset
//
@@ -2247,6 +2250,9 @@ def int_aarch64_sve_st1_scatter_uxtw_index
// 128-bit stores, scaled offsets (indices)
def int_aarch64_sve_st1q_scatter_index : AdvSIMD_ScatterStoreQ_SV_Intrinsic;
+// 128-bit stores, unscaled offsets
+def int_aarch64_sve_st1q_scatter_vector_offset : AdvSIMD_ScatterStoreQ_SV_Intrinsic;
+
//
// Scatter stores: vector base + scalar offset
//
diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
index 8d37d91013b8641..0eeeb70bb106c2a 100644
--- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
+++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
@@ -23748,6 +23748,7 @@ SDValue AArch64TargetLowering::PerformDAGCombine(SDNode *N,
case Intrinsic::aarch64_sve_ld1_gather:
return performGatherLoadCombine(N, DAG, AArch64ISD::GLD1_MERGE_ZERO);
case Intrinsic::aarch64_sve_ld1q_gather_scalar_offset:
+ case Intrinsic::aarch64_sve_ld1q_gather_vector_offset:
return performGatherLoadCombine(N, DAG, AArch64ISD::GLD1Q_MERGE_ZERO);
case Intrinsic::aarch64_sve_ld1q_gather_index:
return performGatherLoadCombine(N, DAG,
@@ -23796,6 +23797,7 @@ SDValue AArch64TargetLowering::PerformDAGCombine(SDNode *N,
return performGatherLoadCombine(N, DAG,
AArch64ISD::GLDFF1_IMM_MERGE_ZERO);
case Intrinsic::aarch64_sve_st1q_scatter_scalar_offset:
+ case Intrinsic::aarch64_sve_st1q_scatter_vector_offset:
return performScatterStoreCombine(N, DAG, AArch64ISD::SST1Q_PRED);
case Intrinsic::aarch64_sve_st1q_scatter_index:
return performScatterStoreCombine(N, DAG, AArch64ISD::SST1Q_INDEX_PRED);
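
Both the scalar-offset and the new vector-offset quadword forms funnel into the same GLD1Q_MERGE_ZERO / SST1Q_PRED nodes; they differ only in which operand carries the vector. The gather counterpart of the store sketch above, under the same assumptions:

  #include <arm_sve.h>

  // Load one 128-bit block per active predicate element from base + off[i]
  // bytes; inactive elements are zeroed (merge-zero semantics).
  svint32_t gather_s32_blocks(svbool_t pg, const int32_t *base, svuint64_t off) {
    return svld1q_gather_u64offset_s32(pg, base, off);
  }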
diff --git a/llvm/test/CodeGen/AArch64/sve2p1-intrinsics-gather-loads-128bit-unscaled-offset.ll b/llvm/test/CodeGen/AArch64/sve2p1-intrinsics-gather-loads-128bit-unscaled-offset.ll
index 64f15897ebb9a75..8bee44be9f0cdbc 100644
--- a/llvm/test/CodeGen/AArch64/sve2p1-intrinsics-gather-loads-128bit-unscaled-offset.ll
+++ b/llvm/test/CodeGen/AArch64/sve2p1-intrinsics-gather-loads-128bit-unscaled-offset.ll
@@ -94,6 +94,126 @@ define <vscale x 8 x bfloat> @ld1q_gather_u64base_bf16(<vscale x 1 x i1> %pg, <v
ret <vscale x 8 x bfloat> %load
}
+define <vscale x 16 x i8> @test_svld1q_gather_u64offset_s8(<vscale x 1 x i1> %pg, ptr %base, <vscale x 2 x i64> %off) {
+; CHECK-LABEL: test_svld1q_gather_u64offset_s8:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: ld1q { z0.q }, p0/z, [z0.d, x0]
+; CHECK-NEXT: ret
+entry:
+ %0 = tail call <vscale x 16 x i8> @llvm.aarch64.sve.ld1q.gather.vector.offset.nxv16i8(<vscale x 1 x i1> %pg, ptr %base, <vscale x 2 x i64> %off)
+ ret <vscale x 16 x i8> %0
+}
+
+define <vscale x 16 x i8> @test_svld1q_gather_u64offset_u8(<vscale x 1 x i1> %pg, ptr %base, <vscale x 2 x i64> %off) {
+; CHECK-LABEL: test_svld1q_gather_u64offset_u8:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: ld1q { z0.q }, p0/z, [z0.d, x0]
+; CHECK-NEXT: ret
+entry:
+ %0 = tail call <vscale x 16 x i8> @llvm.aarch64.sve.ld1q.gather.vector.offset.nxv16i8(<vscale x 1 x i1> %pg, ptr %base, <vscale x 2 x i64> %off)
+ ret <vscale x 16 x i8> %0
+}
+
+define <vscale x 8 x i16> @test_svld1q_gather_u64offset_s16(<vscale x 1 x i1> %pg, ptr %base, <vscale x 2 x i64> %off) {
+; CHECK-LABEL: test_svld1q_gather_u64offset_s16:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: ld1q { z0.q }, p0/z, [z0.d, x0]
+; CHECK-NEXT: ret
+entry:
+ %0 = tail call <vscale x 8 x i16> @llvm.aarch64.sve.ld1q.gather.vector.offset.nxv8i16(<vscale x 1 x i1> %pg, ptr %base, <vscale x 2 x i64> %off)
+ ret <vscale x 8 x i16> %0
+}
+
+define <vscale x 8 x i16> @test_svld1q_gather_u64offset_u16(<vscale x 1 x i1> %pg, ptr %base, <vscale x 2 x i64> %off) {
+; CHECK-LABEL: test_svld1q_gather_u64offset_u16:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: ld1q { z0.q }, p0/z, [z0.d, x0]
+; CHECK-NEXT: ret
+entry:
+ %0 = tail call <vscale x 8 x i16> @llvm.aarch64.sve.ld1q.gather.vector.offset.nxv8i16(<vscale x 1 x i1> %pg, ptr %base, <vscale x 2 x i64> %off)
+ ret <vscale x 8 x i16> %0
+}
+
+define <vscale x 4 x i32> @test_svld1q_gather_u64offset_s32(<vscale x 1 x i1> %pg, ptr %base, <vscale x 2 x i64> %off) {
+; CHECK-LABEL: test_svld1q_gather_u64offset_s32:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: ld1q { z0.q }, p0/z, [z0.d, x0]
+; CHECK-NEXT: ret
+entry:
+ %0 = tail call <vscale x 4 x i32> @llvm.aarch64.sve.ld1q.gather.vector.offset.nxv4i32(<vscale x 1 x i1> %pg, ptr %base, <vscale x 2 x i64> %off)
+ ret <vscale x 4 x i32> %0
+}
+
+define <vscale x 4 x i32> @test_svld1q_gather_u64offset_u32(<vscale x 1 x i1> %pg, ptr %base, <vscale x 2 x i64> %off) {
+; CHECK-LABEL: test_svld1q_gather_u64offset_u32:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: ld1q { z0.q }, p0/z, [z0.d, x0]
+; CHECK-NEXT: ret
+entry:
+ %0 = tail call <vscale x 4 x i32> @llvm.aarch64.sve.ld1q.gather.vector.offset.nxv4i32(<vscale x 1 x i1> %pg, ptr %base, <vscale x 2 x i64> %off)
+ ret <vscale x 4 x i32> %0
+}
+
+define <vscale x 2 x i64> @test_svld1q_gather_u64offset_s64(<vscale x 1 x i1> %pg, ptr %base, <vscale x 2 x i64> %off) {
+; CHECK-LABEL: test_svld1q_gather_u64offset_s64:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: ld1q { z0.q }, p0/z, [z0.d, x0]
+; CHECK-NEXT: ret
+entry:
+ %0 = tail call <vscale x 2 x i64> @llvm.aarch64.sve.ld1q.gather.vector.offset.nxv2i64(<vscale x 1 x i1> %pg, ptr %base, <vscale x 2 x i64> %off)
+ ret <vscale x 2 x i64> %0
+}
+
+define <vscale x 2 x i64> @test_svld1q_gather_u64offset_u64(<vscale x 1 x i1> %pg, ptr %base, <vscale x 2 x i64> %off) {
+; CHECK-LABEL: test_svld1q_gather_u64offset_u64:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: ld1q { z0.q }, p0/z, [z0.d, x0]
+; CHECK-NEXT: ret
+entry:
+ %0 = tail call <vscale x 2 x i64> @llvm.aarch64.sve.ld1q.gather.vector.offset.nxv2i64(<vscale x 1 x i1> %pg, ptr %base, <vscale x 2 x i64> %off)
+ ret <vscale x 2 x i64> %0
+}
+
+define <vscale x 8 x bfloat> @test_svld1q_gather_u64offset_bf16(<vscale x 1 x i1> %pg, ptr %base, <vscale x 2 x i64> %off) {
+; CHECK-LABEL: test_svld1q_gather_u64offset_bf16:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: ld1q { z0.q }, p0/z, [z0.d, x0]
+; CHECK-NEXT: ret
+entry:
+ %0 = tail call <vscale x 8 x bfloat> @llvm.aarch64.sve.ld1q.gather.vector.offset.nxv8bf16(<vscale x 1 x i1> %pg, ptr %base, <vscale x 2 x i64> %off)
+ ret <vscale x 8 x bfloat> %0
+}
+
+define <vscale x 8 x half> @test_svld1q_gather_u64offset_f16(<vscale x 1 x i1> %pg, ptr %base, <vscale x 2 x i64> %off) {
+; CHECK-LABEL: test_svld1q_gather_u64offset_f16:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: ld1q { z0.q }, p0/z, [z0.d, x0]
+; CHECK-NEXT: ret
+entry:
+ %0 = tail call <vscale x 8 x half> @llvm.aarch64.sve.ld1q.gather.vector.offset.nxv8f16(<vscale x 1 x i1> %pg, ptr %base, <vscale x 2 x i64> %off)
+ ret <vscale x 8 x half> %0
+}
+
+define <vscale x 4 x float> @test_svld1q_gather_u64offset_f32(<vscale x 1 x i1> %pg, ptr %base, <vscale x 2 x i64> %off) {
+; CHECK-LABEL: test_svld1q_gather_u64offset_f32:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: ld1q { z0.q }, p0/z, [z0.d, x0]
+; CHECK-NEXT: ret
+entry:
+ %0 = tail call <vscale x 4 x float> @llvm.aarch64.sve.ld1q.gather.vector.offset.nxv4f32(<vscale x 1 x i1> %pg, ptr %base, <vscale x 2 x i64> %off)
+ ret <vscale x 4 x float> %0
+}
+
+define <vscale x 2 x double> @test_svld1q_gather_u64offset_f64(<vscale x 1 x i1> %pg, ptr %base, <vscale x 2 x i64> %off) {
+; CHECK-LABEL: test_svld1q_gather_u64offset_f64:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: ld1q { z0.q }, p0/z, [z0.d, x0]
+; CHECK-NEXT: ret
+entry:
+ %0 = tail call <vscale x 2 x double> @llvm.aarch64.sve.ld1q.gather.vector.offset.nxv2f64(<vscale x 1 x i1> %pg, ptr %base, <vscale x 2 x i64> %off)
+ ret <vscale x 2 x double> %0
+}
+
declare <vscale x 16 x i8> @llvm.aarch64.sve.ld1q.gather.scalar.offset.nxv16i8.nxv2i64(<vscale x 1 x i1>, <vscale x 2 x i64>, i64)
declare <vscale x 8 x i16> @llvm.aarch64.sve.ld1q.gather.scalar.offset.nxv8i16.nxv2i64(<vscale x 1 x i1>, <vscale x 2 x i64>, i64)
declare <vscale x 4 x i32> @llvm.aarch64.sve.ld1q.gather.scalar.offset.nxv4i32.nxv2i64(<vscale x 1 x i1>, <vscale x 2 x i64>, i64)
@@ -102,4 +222,11 @@ declare <vscale x 8 x half> @llvm.aarch64.sve.ld1q.gather.scalar.offset.nxv8f16.
declare <vscale x 4 x float> @llvm.aarch64.sve.ld1q.gather.scalar.offset.nxv4f32.nxv2i64(<vscale x 1 x i1>, <vscale x 2 x i64>, i64)
declare <vscale x 2 x double> @llvm.aarch64.sve.ld1q.gather.scalar.offset.nxv2f64.nxv2i64(<vscale x 1 x i1>, <vscale x 2 x i64>, i64)
declare <vscale x 8 x bfloat> @llvm.aarch64.sve.ld1q.gather.scalar.offset.nxv8bf16.nxv2i64(<vscale x 1 x i1>, <vscale x 2 x i64>, i64)
-
+declare <vscale x 16 x i8> @llvm.aarch64.sve.ld1q.gather.vector.offset.nxv16i8(<vscale x 1 x i1>, ptr, <vscale x 2 x i64>)
+declare <vscale x 8 x i16> @llvm.aarch64.sve.ld1q.gather.vector.offset.nxv8i16(<vscale x 1 x i1>, ptr, <vscale x 2 x i64>)
+declare <vscale x 4 x i32> @llvm.aarch64.sve.ld1q.gather.vector.offset.nxv4i32(<vscale x 1 x i1>, ptr, <vscale x 2 x i64>)
+declare <vscale x 2 x i64> @llvm.aarch64.sve.ld1q.gather.vector.offset.nxv2i64(<vscale x 1 x i1>, ptr, <vscale x 2 x i64>)
+declare <vscale x 8 x bfloat> @llvm.aarch64.sve.ld1q.gather.vector.offset.nxv8bf16(<vscale x 1 x i1>, ptr, <vscale x 2 x i64>)
+declare <vscale x 8 x half> @llvm.aarch64.sve.ld1q.gather.vector.offset.nxv8f16(<vscale x 1 x i1>, ptr, <vscale x 2 x i64>)
+declare <vscale x 4 x float> @llvm.aarch64.sve.ld1q.gather.vector.offset.nxv4f32(<vscale x 1 x i1>, ptr, <vscale x 2 x i64>)
+declare <vscale x 2 x double> @llvm.aarch64.sve.ld1q.gather.vector.offset.nxv2f64(<vscale x 1 x i1>, ptr, <vscale x 2 x i64>)
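
These tests mirror the scaled-index ones, but the offset is taken in bytes. Assuming the usual ACLE scaled/unscaled relationship (which this patch does not spell out), a scaled index i should address the same memory as a byte offset of i * sizeof(element); a sketch for 32-bit elements:

  #include <arm_sve.h>

  svuint32_t gather_by_index(svbool_t pg, const uint32_t *base, svuint64_t idx) {
    return svld1q_gather_u64index_u32(pg, base, idx);
  }

  // Expected to be equivalent: byte offset = index << 2 for 4-byte elements.
  svuint32_t gather_by_offset(svbool_t pg, const uint32_t *base, svuint64_t idx) {
    return svld1q_gather_u64offset_u32(pg, base,
                                       svlsl_n_u64_x(svptrue_b64(), idx, 2));
  }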
diff --git a/llvm/test/CodeGen/AArch64/sve2p1-intrinsics-scatter-stores-128bit-unscaled-offset.ll b/llvm/test/CodeGen/AArch64/sve2p1-intrinsics-scatter-stores-128bit-unscaled-offset.ll
index c62df1d8d254ccb..6493640c06abdd3 100644
--- a/llvm/test/CodeGen/AArch64/sve2p1-intrinsics-scatter-stores-128bit-unscaled-offset.ll
+++ b/llvm/test/CodeGen/AArch64/sve2p1-intrinsics-scatter-stores-128bit-unscaled-offset.ll
@@ -102,12 +102,139 @@ define void @sst1_scatter_u64base_offset_bf16(<vscale x 8 x bfloat> %data, <vsca
ret void
}
+define void @test_svst1q_scatter_u64offset_s8(<vscale x 1 x i1> %pg, ptr %base, <vscale x 2 x i64> %off, <vscale x 16 x i8> %data) {
+; CHECK-LABEL: test_svst1q_scatter_u64offset_s8:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: st1q { z1.q }, p0, [z0.d, x0]
+; CHECK-NEXT: ret
+entry:
+ tail call void @llvm.aarch64.sve.st1q.scatter.vector.offset.nxv16i8(<vscale x 16 x i8> %data, <vscale x 1 x i1> %pg, ptr %base, <vscale x 2 x i64> %off)
+ ret void
+}
+
+define void @test_svst1q_scatter_u64offset_u8(<vscale x 1 x i1> %pg, ptr %base, <vscale x 2 x i64> %off, <vscale x 16 x i8> %data) {
+; CHECK-LABEL: test_svst1q_scatter_u64offset_u8:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: st1q { z1.q }, p0, [z0.d, x0]
+; CHECK-NEXT: ret
+entry:
+ tail call void @llvm.aarch64.sve.st1q.scatter.vector.offset.nxv16i8(<vscale x 16 x i8> %data, <vscale x 1 x i1> %pg, ptr %base, <vscale x 2 x i64> %off)
+ ret void
+}
+
+define void @test_svst1q_scatter_u64offset_s16(<vscale x 1 x i1> %pg, ptr %base, <vscale x 2 x i64> %off, <vscale x 8 x i16> %data) {
+; CHECK-LABEL: test_svst1q_scatter_u64offset_s16:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: st1q { z1.q }, p0, [z0.d, x0]
+; CHECK-NEXT: ret
+entry:
+ tail call void @llvm.aarch64.sve.st1q.scatter.vector.offset.nxv8i16(<vscale x 8 x i16> %data, <vscale x 1 x i1> %pg, ptr %base, <vscale x 2 x i64> %off)
+ ret void
+}
+
+define void @test_svst1q_scatter_u64offset_u16(<vscale x 1 x i1> %pg, ptr %base, <vscale x 2 x i64> %off, <vscale x 8 x i16> %data) {
+; CHECK-LABEL: test_svst1q_scatter_u64offset_u16:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: st1q { z1.q }, p0, [z0.d, x0]
+; CHECK-NEXT: ret
+entry:
+ tail call void @llvm.aarch64.sve.st1q.scatter.vector.offset.nxv8i16(<vscale x 8 x i16> %data, <vscale x 1 x i1> %pg, ptr %base, <vscale x 2 x i64> %off)
+ ret void
+}
+
+define void @test_svst1q_scatter_u64offset_s32(<vscale x 1 x i1> %pg, ptr %base, <vscale x 2 x i64> %off, <vscale x 4 x i32> %data) {
+; CHECK-LABEL: test_svst1q_scatter_u64offset_s32:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: st1q { z1.q }, p0, [z0.d, x0]
+; CHECK-NEXT: ret
+entry:
+ tail call void @llvm.aarch64.sve.st1q.scatter.vector.offset.nxv4i32(<vscale x 4 x i32> %data, <vscale x 1 x i1> %pg, ptr %base, <vscale x 2 x i64> %off)
+ ret void
+}
+
+define void @test_svst1q_scatter_u64offset_u32(<vscale x 1 x i1> %pg, ptr %base, <vscale x 2 x i64> %off, <vscale x 4 x i32> %data) {
+; CHECK-LABEL: test_svst1q_scatter_u64offset_u32:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: st1q { z1.q }, p0, [z0.d, x0]
+; CHECK-NEXT: ret
+entry:
+ tail call void @llvm.aarch64.sve.st1q.scatter.vector.offset.nxv4i32(<vscale x 4 x i32> %data, <vscale x 1 x i1> %pg, ptr %base, <vscale x 2 x i64> %off)
+ ret void
+}
+
+define void @test_svst1q_scatter_u64offset_s64(<vscale x 1 x i1> %pg, ptr %base, <vscale x 2 x i64> %off, <vscale x 2 x i64> %data) {
+; CHECK-LABEL: test_svst1q_scatter_u64offset_s64:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: st1q { z1.q }, p0, [z0.d, x0]
+; CHECK-NEXT: ret
+entry:
+ tail call void @llvm.aarch64.sve.st1q.scatter.vector.offset.nxv2i64(<vscale x 2 x i64> %data, <vscale x 1 x i1> %pg, ptr %base, <vscale x 2 x i64> %off)
+ ret void
+}
+
+define void @test_svst1q_scatter_u64offset_u64(<vscale x 1 x i1> %pg, ptr %base, <vscale x 2 x i64> %off, <vscale x 2 x i64> %data) {
+; CHECK-LABEL: test_svst1q_scatter_u64offset_u64:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: st1q { z1.q }, p0, [z0.d, x0]
+; CHECK-NEXT: ret
+entry:
+ tail call void @llvm.aarch64.sve.st1q.scatter.vector.offset.nxv2i64(<vscale x 2 x i64> %data, <vscale x 1 x i1> %pg, ptr %base, <vscale x 2 x i64> %off)
+ ret void
+}
+
+define void @test_svst1q_scatter_u64offset_bf16(<vscale x 1 x i1> %pg, ptr %base, <vscale x 2 x i64> %off, <vscale x 8 x bfloat> %data) {
+; CHECK-LABEL: test_svst1q_scatter_u64offset_bf16:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: st1q { z1.q }, p0, [z0.d, x0]
+; CHECK-NEXT: ret
+entry:
+ tail call void @llvm.aarch64.sve.st1q.scatter.vector.offset.nxv8bf16(<vscale x 8 x bfloat> %data, <vscale x 1 x i1> %pg, ptr %base, <vscale x 2 x i64> %off)
+ ret void
+}
+
+define void @test_svst1q_scatter_u64offset_f16(<vscale x 1 x i1> %pg, ptr %base, <vscale x 2 x i64> %off, <vscale x 8 x half> %data) {
+; CHECK-LABEL: test_svst1q_scatter_u64offset_f16:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: st1q { z1.q }, p0, [z0.d, x0]
+; CHECK-NEXT: ret
+entry:
+ tail call void @llvm.aarch64.sve.st1q.scatter.vector.offset.nxv8f16(<vscale x 8 x half> %data, <vscale x 1 x i1> %pg, ptr %base, <vscale x 2 x i64> %off)
+ ret void
+}
+
+define void @test_svst1q_scatter_u64offset_f32(<vscale x 1 x i1> %pg, ptr %base, <vscale x 2 x i64> %off, <vscale x 4 x float> %data) {
+; CHECK-LABEL: test_svst1q_scatter_u64offset_f32:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: st1q { z1.q }, p0, [z0.d, x0]
+; CHECK-NEXT: ret
+entry:
+ tail call void @llvm.aarch64.sve.st1q.scatter.vector.offset.nxv4f32(<vscale x 4 x float> %data, <vscale x 1 x i1> %pg, ptr %base, <vscale x 2 x i64> %off)
+ ret void
+}
+
+define void @test_svst1q_scatter_u64offset_f64(<vscale x 1 x i1> %pg, ptr %base, <vscale x 2 x i64> %off, <vscale x 2 x double> %data) {
+; CHECK-LABEL: test_svst1q_scatter_u64offset_f64:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: st1q { z1.q }, p0, [z0.d, x0]
+; CHECK-NEXT: ret
+entry:
+ tail call void @llvm.aarch64.sve.st1q.scatter.vector.offset.nxv2f64(<vscale x 2 x double> %data, <vscale x 1 x i1> %pg, ptr %base, <vscale x 2 x i64> %off)
+ ret void
+}
+
declare void @llvm.aarch64.sve.st1q.scatter.scalar.offset.nxv16i8.nxv2i64(<vscale x 16 x i8>, <vscale x 1 x i1>, <vscale x 2 x i64>, i64)
declare void @llvm.aarch64.sve.st1q.scatter.scalar.offset.nxv8i16.nxv2i64(<vscale x 8 x i16>, <vscale x 1 x i1>, <vscale x 2 x i64>, i64)
declare void @llvm.aarch64.sve.st1q.scatter.scalar.offset.nxv4i32.nxv2i64(<vscale x 4 x i32>, <vscale x 1 x i1>, <vscale x 2 x i64>, i64)
declare void @llvm.aarch64.sve.st1q.scatter.scalar.offset.nxv2i64.nxv2i64(<vscale x 2 x i64>, <vscale x 1 x i1>, <vscale x 2 x i64>, i64)
-
declare void @llvm.aarch64.sve.st1q.scatter.scalar.offset.nxv8f16.nxv2i64(<vscale x 8 x half>, <vscale x 1 x i1>, <vscale x 2 x i64>, i64)
declare void @llvm.aarch64.sve.st1q.scatter.scalar.offset.nxv4f32.nxv2i64(<vscale x 4 x float>, <vscale x 1 x i1>, <vscale x 2 x i64>, i64)
declare void @llvm.aarch64.sve.st1q.scatter.scalar.offset.nxv2f64.nxv2i64(<vscale x 2 x double>, <vscale x 1 x i1>, <vscale x 2 x i64>, i64)
declare void @llvm.aarch64.sve.st1q.scatter.scalar.offset.nxv8bf16.nxv2i64(<vscale x 8 x bfloat>, <vscale x 1 x i1>, <vscale x 2 x i64>, i64)
+declare void @llvm.aarch64.sve.st1q.scatter.vector.offset.nxv16i8(<vscale x 16 x i8>, <vscale x 1 x i1>, ptr, <vscale x 2 x i64>)
+declare void @llvm.aarch64.sve.st1q.scatter.vector.offset.nxv8i16(<vscale x 8 x i16>, <vscale x 1 x i1>, ptr, <vscale x 2 x i64>)
+declare void @llvm.aarch64.sve.st1q.scatter.vector.offset.nxv4i32(<vscale x 4 x i32>, <vscale x 1 x i1>, ptr, <vscale x 2 x i64>)
+declare void @llvm.aarch64.sve.st1q.scatter.vector.offset.nxv2i64(<vscale x 2 x i64>, <vscale x 1 x i1>, ptr, <vscale x 2 x i64>)
+declare void @llvm.aarch64.sve.st1q.scatter.vector.offset.nxv8bf16(<vscale x 8 x bfloat>, <vscale x 1 x i1>, ptr, <vscale x 2 x i64>)
+declare void @llvm.aarch64.sve.st1q.scatter.vector.offset.nxv8f16(<vscale x 8 x half>, <vscale x 1 x i1>, ptr, <vscale x 2 x i64>)
+declare void @llvm.aarch64.sve.st1q.scatter.vector.offset.nxv4f32(<vscale x 4 x float>, <vscale x 1 x i1>, ptr, <vscale x 2 x i64>)
+declare void @llvm.aarch64.sve.st1q.scatter.vector.offset.nxv2f64(<vscale x 2 x double>, <vscale x 1 x i1>, ptr, <vscale x 2 x i64>)
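
One natural use of a byte-granularity quadword scatter is writing 16-byte records at arbitrary positions in a packed buffer. A sketch under the same assumptions as above (buf, record_offsets and record_bytes are hypothetical names for illustration):

  #include <arm_sve.h>
  #include <stdint.h>

  void store_records(uint8_t *buf, const uint64_t *record_offsets,
                     svuint8_t record_bytes) {
    // Each active 64-bit lane selects one 16-byte destination in buf.
    svuint64_t off = svld1_u64(svptrue_b64(), record_offsets);
    svst1q_scatter_u64offset_u8(svptrue_b8(), buf, off, record_bytes);
  }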
>From 4a1878814c1d1858ef1a27d8f15b1162b95ef2c6 Mon Sep 17 00:00:00 2001
From: Momchil Velikov <momchil.velikov at arm.com>
Date: Fri, 10 Nov 2023 15:55:26 +0000
Subject: [PATCH 4/4] Fixup commit
---
clang/include/clang/Basic/arm_sve.td | 4 ++--
1 file changed, 2 insertions(+), 2 deletions(-)
diff --git a/clang/include/clang/Basic/arm_sve.td b/clang/include/clang/Basic/arm_sve.td
index c32270dc4c85256..cd4c09a3ad7a81c 100644
--- a/clang/include/clang/Basic/arm_sve.td
+++ b/clang/include/clang/Basic/arm_sve.td
@@ -311,7 +311,7 @@ let TargetGuard = "sve2p1" in {
def SVLD1Q_GATHER_U64BASE : MInst<"svld1q_gather[_{2}base]_{d}", "dPg", "cUcsUsiUilUlfhdb", [IsGatherLoad, IsByteIndexed], MemEltTyDefault, "aarch64_sve_ld1q_gather_scalar_offset">;
// Load one vector (scalar base + vector offset)
- def SVLD1Q_GATHER_U64OFFSET : MInst<"svld1q_gather_[{3}]offset[_{0}]", "dPcg", "cUcsUsiUilUlfhdb", [IsGatherLoad, IsByteIndexed], MemEltTyDefault, "aarch64_sve_ld1q_gather_vector_offset">;
+ def SVLD1Q_GATHER_U64OFFSET : MInst<"svld1q_gather_[{3}]offset[_{d}]", "dPcg", "cUcsUsiUilUlfhdb", [IsGatherLoad, IsByteIndexed], MemEltTyDefault, "aarch64_sve_ld1q_gather_vector_offset">;
// Load N-element structure into N vectors (scalar base)
defm SVLD2Q : StructLoad<"svld2q[_{2}]", "2Pc", "aarch64_sve_ld2q_sret">;
@@ -465,7 +465,7 @@ let TargetGuard = "sve2p1" in {
def SVST1Q_SCATTER_U64BASE : MInst<"svst1q_scatter[_{2}base][_{d}]", "vPgd", "cUcsUsiUilUlfhdb", [IsScatterStore, IsByteIndexed], MemEltTyDefault, "aarch64_sve_st1q_scatter_scalar_offset">;
// Store one vector (scalar base + vector offset)
- def SVST1Q_SCATTER_U64OFFSET : MInst<"svst1q_scatter_[{3}]offset[_{0}]", "vPpgd", "cUcsUsiUilUlfhdb", [IsScatterStore, IsByteIndexed], MemEltTyDefault, "aarch64_sve_st1q_scatter_vector_offset">;
+ def SVST1Q_SCATTER_U64OFFSET : MInst<"svst1q_scatter_[{3}]offset[_{d}]", "vPpgd", "cUcsUsiUilUlfhdb", [IsScatterStore, IsByteIndexed], MemEltTyDefault, "aarch64_sve_st1q_scatter_vector_offset">;
// Store N vectors into N-element structure (scalar base)
defm SVST2Q : StructStore<"svst2q[_{d}]", "vPc2", "aarch64_sve_st2q">;
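
With the type suffix inside the optional brackets, both the fully-suffixed and the overloaded spellings should be accepted, which is what the SVE_ACLE_FUNC macro in the tests toggles between. For illustration (assuming the usual ACLE overloading rules):

  #include <arm_sve.h>

  void both_spellings(svbool_t pg, int32_t *base, svuint64_t off, svint32_t data) {
    svst1q_scatter_u64offset_s32(pg, base, off, data); // explicit suffix
    svst1q_scatter_u64offset(pg, base, off, data);     // overloaded form
  }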