[llvm] 0f2b68d - Implement IR intrinsics for gather prefetch.
Francesco Petrogalli via llvm-commits
llvm-commits at lists.llvm.org
Mon Mar 16 12:01:56 PDT 2020
Author: Francesco Petrogalli
Date: 2020-03-16T18:52:35Z
New Revision: 0f2b68d9c70eb16e94a50a06c9c111cc2858fec8
URL: https://github.com/llvm/llvm-project/commit/0f2b68d9c70eb16e94a50a06c9c111cc2858fec8
DIFF: https://github.com/llvm/llvm-project/commit/0f2b68d9c70eb16e94a50a06c9c111cc2858fec8.diff
LOG: Implement IR intrinsics for gather prefetch.
Summary:
Intrinsics and the corresponding codegen have been implemented for the
following SVE instructions:
1. PRF<T> <prfop>, <Pg>, [<Xn|SP>, <Zm>.S, <mod>] -> 32-bit scaled offset
2. PRF<T> <prfop>, <Pg>, [<Xn|SP>, <Zm>.D, <mod>] -> 32-bit unpacked scaled offset
3. PRF<T> <prfop>, <Pg>, [<Xn|SP>, <Zm>.D] -> 64-bit scaled offset
4. PRF<T> <prfop>, <Pg>, [<Zn>.S{, #<imm>}] -> 32-bit element
5. PRF<T> <prfop>, <Pg>, [<Zn>.D{, #<imm>}] -> 64-bit element
The instructions are associated with the following intrinsics, respectively:
1. void @llvm.aarch64.sve.gather.prf<T>.scaled.<mod>.nx4vi32(
     <vscale x 4 x i1> %Pg,
     i8* %base,
     <vscale x 4 x i32> %offset,
     i32 %prfop)
2. void @llvm.aarch64.sve.gather.prf<T>.scaled.<mod>.nx2vi64(
     <vscale x 2 x i1> %Pg,
     i8* %base,
     <vscale x 2 x i32> %offset,
     i32 %prfop)
3. void @llvm.aarch64.sve.gather.prf<T>.scaled.nx2vi64(
     <vscale x 2 x i1> %Pg,
     i8* %base,
     <vscale x 2 x i64> %offset,
     i32 %prfop)
4. void @llvm.aarch64.sve.gather.prf<T>.nx4vi32(
     <vscale x 4 x i1> %Pg,
     <vscale x 4 x i32> %bases,
     i64 %imm,
     i32 %prfop)
5. void @llvm.aarch64.sve.gather.prf<T>.nx2vi64(
     <vscale x 2 x i1> %Pg,
     <vscale x 2 x i64> %bases,
     i64 %imm,
     i32 %prfop)
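For example (a sketch distilled from the tests added by this patch), intrinsic
1. with <T> = b and <mod> = uxtw, called with prfop 1 (pldl1strm) and with %Pg,
%base and %offset living in p0, x0 and z0 respectively, selects a single
prefetch instruction:

  call void @llvm.aarch64.sve.gather.prfb.scaled.uxtw.nx4vi32(
              <vscale x 4 x i1> %Pg, i8* %base, <vscale x 4 x i32> %offset, i32 1)
  ; generates: prfb pldl1strm, p0, [x0, z0.s, uxtw]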
The intrinsics are the IR counterparts of the following SVE ACLE functions:
* void svprf<T>(svbool_t pg, const void *base, svprfop op)
* void svprf<T>_vnum(svbool_t pg, const void *base, int64_t vnum, svprfop op)
* void svprf<T>_gather[_u32base](svbool_t pg, svuint32_t bases, svprfop op)
* void svprf<T>_gather[_u64base](svbool_t pg, svuint64_t bases, svprfop op)
* void svprf<T>_gather_[s32]offset(svbool_t pg, const void *base, svint32_t offsets, svprfop op)
* void svprf<T>_gather_[u32]offset(svbool_t pg, const void *base, svuint32_t offsets, svprfop op)
* void svprf<T>_gather_[s64]offset(svbool_t pg, const void *base, svint64_t offsets, svprfop op)
* void svprf<T>_gather_[u64]offset(svbool_t pg, const void *base, svuint64_t offsets, svprfop op)
* void svprf<T>_gather[_u32base]_offset(svbool_t pg, svuint32_t bases, int64_t offset, svprfop op)
* void svprf<T>_gather[_u64base]_offset(svbool_t pg, svuint64_t bases, int64_t offset, svprfop op)
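When the scalar offset passed to intrinsics 4. and 5. is not of the form
sizeof(<T>) * k, for k = 0, 1, ..., 31, the vector-plus-immediate form of the
instruction cannot encode it, so codegen falls back to the scalar-plus-vector
form, materializing the offset in a scalar register. Below is a sketch of the
two lowerings for prfb with 32-bit elements (the register x8 is illustrative;
the tests added by this patch only match a generic register):

  ; #7 is a valid immediate for PRFB, so the vector-plus-immediate form is used:
  call void @llvm.aarch64.sve.gather.prfb.nx4vi32(
              <vscale x 4 x i1> %Pg, <vscale x 4 x i32> %bases, i64 7, i32 1)
  ; generates: prfb pldl1strm, p0, [z0.s, #7]

  ; #32 is out of range, so the immediate is moved to a scalar base register:
  call void @llvm.aarch64.sve.gather.prfb.nx4vi32(
              <vscale x 4 x i1> %Pg, <vscale x 4 x i32> %bases, i64 32, i32 1)
  ; generates: mov w8, #32
  ;            prfb pldl1strm, p0, [x8, z0.s, uxtw]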
Reviewers: andwar, sdesmalen, efriedma, rengolin
Subscribers: tschuett, hiraditya, rkruppe, psnobl, llvm-commits
Tags: #llvm
Differential Revision: https://reviews.llvm.org/D75580
Added:
llvm/test/CodeGen/AArch64/sve-intrinsics-gather-prefetches-scaled-offset.ll
llvm/test/CodeGen/AArch64/sve-intrinsics-gather-prefetches-vect-base-imm-offset.ll
llvm/test/CodeGen/AArch64/sve-intrinsics-gather-prefetches-vect-base-invalid-imm-offset.ll
Modified:
llvm/include/llvm/IR/IntrinsicsAArch64.td
llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td
llvm/lib/Target/AArch64/MCTargetDesc/AArch64AddressingModes.h
llvm/lib/Target/AArch64/SVEInstrFormats.td
Removed:
################################################################################
diff --git a/llvm/include/llvm/IR/IntrinsicsAArch64.td b/llvm/include/llvm/IR/IntrinsicsAArch64.td
index 3a205de4e368..a220934c5923 100644
--- a/llvm/include/llvm/IR/IntrinsicsAArch64.td
+++ b/llvm/include/llvm/IR/IntrinsicsAArch64.td
@@ -1263,6 +1263,27 @@ class AdvSIMD_ScatterStore_VS_Intrinsic
],
[IntrWriteMem, IntrArgMemOnly]>;
+
+class SVE_gather_prf_scalar_base_vector_offset_scaled
+ : Intrinsic<[],
+ [
+ LLVMScalarOrSameVectorWidth<0, llvm_i1_ty>, // Predicate
+ llvm_ptr_ty, // Base address
+ llvm_anyvector_ty, // Offsets
+ llvm_i32_ty // Prfop
+ ],
+ [IntrInaccessibleMemOrArgMemOnly, NoCapture<1>, ImmArg<3>]>;
+
+class SVE_gather_prf_vector_base_scalar_offset
+ : Intrinsic<[],
+ [
+ LLVMScalarOrSameVectorWidth<0, llvm_i1_ty>, // Predicate
+ llvm_anyvector_ty, // Base addresses
+ llvm_i64_ty, // Scalar offset
+ llvm_i32_ty // Prfop
+ ],
+ [IntrInaccessibleMemOrArgMemOnly, ImmArg<3>]>;
+
//
// Loads
//
@@ -1279,13 +1300,39 @@ def int_aarch64_sve_ldff1 : AdvSIMD_1Vec_PredFaultingLoad_Intrinsic;
def int_aarch64_sve_stnt1 : AdvSIMD_1Vec_PredStore_Intrinsic;
//
-// Prefetch
+// Prefetches
//
def int_aarch64_sve_prf
: Intrinsic<[], [llvm_anyvector_ty, llvm_ptr_ty, llvm_i32_ty],
[IntrArgMemOnly, ImmArg<2>]>;
+// Scalar + 32-bit scaled offset vector, zero extend, packed and
+// unpacked.
+def int_aarch64_sve_gather_prfb_scaled_uxtw : SVE_gather_prf_scalar_base_vector_offset_scaled;
+def int_aarch64_sve_gather_prfh_scaled_uxtw : SVE_gather_prf_scalar_base_vector_offset_scaled;
+def int_aarch64_sve_gather_prfw_scaled_uxtw : SVE_gather_prf_scalar_base_vector_offset_scaled;
+def int_aarch64_sve_gather_prfd_scaled_uxtw : SVE_gather_prf_scalar_base_vector_offset_scaled;
+
+// Scalar + 32-bit scaled offset vector, sign extend, packed and
+// unpacked.
+def int_aarch64_sve_gather_prfb_scaled_sxtw : SVE_gather_prf_scalar_base_vector_offset_scaled;
+def int_aarch64_sve_gather_prfw_scaled_sxtw : SVE_gather_prf_scalar_base_vector_offset_scaled;
+def int_aarch64_sve_gather_prfh_scaled_sxtw : SVE_gather_prf_scalar_base_vector_offset_scaled;
+def int_aarch64_sve_gather_prfd_scaled_sxtw : SVE_gather_prf_scalar_base_vector_offset_scaled;
+
+// Scalar + 64-bit scaled offset vector.
+def int_aarch64_sve_gather_prfb_scaled : SVE_gather_prf_scalar_base_vector_offset_scaled;
+def int_aarch64_sve_gather_prfh_scaled : SVE_gather_prf_scalar_base_vector_offset_scaled;
+def int_aarch64_sve_gather_prfw_scaled : SVE_gather_prf_scalar_base_vector_offset_scaled;
+def int_aarch64_sve_gather_prfd_scaled : SVE_gather_prf_scalar_base_vector_offset_scaled;
+
+// Vector + scalar.
+def int_aarch64_sve_gather_prfb : SVE_gather_prf_vector_base_scalar_offset;
+def int_aarch64_sve_gather_prfh : SVE_gather_prf_vector_base_scalar_offset;
+def int_aarch64_sve_gather_prfw : SVE_gather_prf_vector_base_scalar_offset;
+def int_aarch64_sve_gather_prfd : SVE_gather_prf_vector_base_scalar_offset;
+
//
// Scalar to vector operations
//
diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
index 23df49790b5e..afe7412d0847 100644
--- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
+++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
@@ -12646,6 +12646,20 @@ static SDValue getScaledOffsetForBitWidth(SelectionDAG &DAG, SDValue Offset,
return DAG.getNode(ISD::SHL, DL, MVT::nxv2i64, Offset, SplatShift);
}
+/// Check if the value of \p Offset represents a valid immediate for the SVE
+/// gather load/prefetch and scatter store instructions with vector base and
+/// immediate offset addressing mode:
+///
+/// [<Zn>.[S|D]{, #<imm>}]
+///
+/// where <imm> = sizeof(<T>) * k, for k = 0, 1, ..., 31.
+static bool isValidImmForSVEVecImmAddrMode(SDValue Offset,
+ unsigned ScalarSizeInBytes) {
+ ConstantSDNode *OffsetConst = dyn_cast<ConstantSDNode>(Offset.getNode());
+ return OffsetConst && AArch64_AM::isValidImmForSVEVecImmAddrMode(
+ OffsetConst->getZExtValue(), ScalarSizeInBytes);
+}
+
static SDValue performScatterStoreCombine(SDNode *N, SelectionDAG &DAG,
unsigned Opcode,
bool OnlyPackedOffsets = true) {
@@ -12697,13 +12711,9 @@ static SDValue performScatterStoreCombine(SDNode *N, SelectionDAG &DAG,
// immediates outside that range and non-immediate scalar offsets use SST1 or
// SST1_UXTW instead.
if (Opcode == AArch64ISD::SST1_IMM) {
- uint64_t MaxIndex = 31;
- uint64_t SrcElSize = SrcElVT.getStoreSize().getKnownMinSize();
-
ConstantSDNode *OffsetConst = dyn_cast<ConstantSDNode>(Offset.getNode());
- if (nullptr == OffsetConst ||
- OffsetConst->getZExtValue() > MaxIndex * SrcElSize ||
- OffsetConst->getZExtValue() % SrcElSize) {
+ if (!isValidImmForSVEVecImmAddrMode(Offset,
+ SrcVT.getScalarSizeInBits() / 8)) {
if (MVT::nxv4i32 == Base.getValueType().getSimpleVT().SimpleTy)
Opcode = AArch64ISD::SST1_UXTW;
else
@@ -12763,7 +12773,6 @@ static SDValue performGatherLoadCombine(SDNode *N, SelectionDAG &DAG,
"Gather loads are only possible for SVE vectors");
SDLoc DL(N);
- MVT RetElVT = RetVT.getVectorElementType().getSimpleVT();
// Make sure that the loaded data will fit into an SVE register
if (RetVT.getSizeInBits().getKnownMinSize() > AArch64::SVEBitsPerBlock)
@@ -12780,8 +12789,8 @@ static SDValue performGatherLoadCombine(SDNode *N, SelectionDAG &DAG,
// applies to non-temporal gathers because there's no instruction that takes
// indices.
if (Opcode == AArch64ISD::GLDNT1_INDEX) {
- Offset =
- getScaledOffsetForBitWidth(DAG, Offset, DL, RetElVT.getSizeInBits());
+ Offset = getScaledOffsetForBitWidth(DAG, Offset, DL,
+ RetVT.getScalarSizeInBits());
Opcode = AArch64ISD::GLDNT1;
}
@@ -12800,13 +12809,8 @@ static SDValue performGatherLoadCombine(SDNode *N, SelectionDAG &DAG,
// immediates outside that range and non-immediate scalar offsets use GLD1 or
// GLD1_UXTW instead.
if (Opcode == AArch64ISD::GLD1_IMM || Opcode == AArch64ISD::GLDFF1_IMM) {
- uint64_t MaxIndex = 31;
- uint64_t RetElSize = RetElVT.getStoreSize().getKnownMinSize();
-
- ConstantSDNode *OffsetConst = dyn_cast<ConstantSDNode>(Offset.getNode());
- if (nullptr == OffsetConst ||
- OffsetConst->getZExtValue() > MaxIndex * RetElSize ||
- OffsetConst->getZExtValue() % RetElSize) {
+ if (!isValidImmForSVEVecImmAddrMode(Offset,
+ RetVT.getScalarSizeInBits() / 8)) {
if (MVT::nxv4i32 == Base.getValueType().getSimpleVT().SimpleTy)
Opcode = (Opcode == AArch64ISD::GLD1_IMM) ? AArch64ISD::GLD1_UXTW
: AArch64ISD::GLDFF1_UXTW;
@@ -12950,6 +12954,51 @@ performSignExtendInRegCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI,
return SDValue(N, 0);
}
+/// Legalize the gather prefetch (scalar + vector addressing mode) when the
+/// offset vector is an unpacked 32-bit scalable vector. The other cases (Offset
+/// != nxv2i32) do not need legalization.
+static SDValue legalizeSVEGatherPrefetchOffsVec(SDNode *N, SelectionDAG &DAG) {
+ const unsigned OffsetPos = 4;
+ SDValue Offset = N->getOperand(OffsetPos);
+
+ // Not an unpacked vector, bail out.
+ if (Offset.getValueType().getSimpleVT().SimpleTy != MVT::nxv2i32)
+ return SDValue();
+
+ // Extend the unpacked offset vector to 64-bit lanes.
+ SDLoc DL(N);
+ Offset = DAG.getNode(ISD::ANY_EXTEND, DL, MVT::nxv2i64, Offset);
+ SmallVector<SDValue, 5> Ops(N->op_begin(), N->op_end());
+ // Replace the offset operand with the 64-bit one.
+ Ops[OffsetPos] = Offset;
+
+ return DAG.getNode(N->getOpcode(), DL, DAG.getVTList(MVT::Other), Ops);
+}
+
+/// Combines a node carrying the intrinsic `aarch64_sve_gather_prf<T>` into a
+/// node that uses `aarch64_sve_gather_prf<T>_scaled_uxtw` when the scalar
+/// offset passed to `aarch64_sve_gather_prf<T>` is not a valid immediate for
+/// the SVE gather prefetch instruction with vector plus immediate addressing
+/// mode.
+static SDValue combineSVEPrefetchVecBaseImmOff(SDNode *N, SelectionDAG &DAG,
+ unsigned NewIID,
+ unsigned ScalarSizeInBytes) {
+ const unsigned ImmPos = 4, OffsetPos = 3;
+ // No need to combine the node if the immediate is valid...
+ if (isValidImmForSVEVecImmAddrMode(N->getOperand(ImmPos), ScalarSizeInBytes))
+ return SDValue();
+
+  // ...otherwise swap the vector of bases with the scalar offset...
+ SmallVector<SDValue, 5> Ops(N->op_begin(), N->op_end());
+ std::swap(Ops[ImmPos], Ops[OffsetPos]);
+ // ...and remap the intrinsic `aarch64_sve_gather_prf<T>` to
+ // `aarch64_sve_gather_prf<T>_scaled_uxtw`.
+ SDLoc DL(N);
+ Ops[1] = DAG.getConstant(NewIID, DL, MVT::i64);
+
+ return DAG.getNode(N->getOpcode(), DL, DAG.getVTList(MVT::Other), Ops);
+}
+
SDValue AArch64TargetLowering::PerformDAGCombine(SDNode *N,
DAGCombinerInfo &DCI) const {
SelectionDAG &DAG = DCI.DAG;
@@ -13014,6 +13063,31 @@ SDValue AArch64TargetLowering::PerformDAGCombine(SDNode *N,
case ISD::INTRINSIC_VOID:
case ISD::INTRINSIC_W_CHAIN:
switch (cast<ConstantSDNode>(N->getOperand(1))->getZExtValue()) {
+ case Intrinsic::aarch64_sve_gather_prfb:
+ return combineSVEPrefetchVecBaseImmOff(
+ N, DAG, Intrinsic::aarch64_sve_gather_prfb_scaled_uxtw,
+ 1 /*=ScalarSizeInBytes*/);
+ case Intrinsic::aarch64_sve_gather_prfh:
+ return combineSVEPrefetchVecBaseImmOff(
+ N, DAG, Intrinsic::aarch64_sve_gather_prfh_scaled_uxtw,
+ 2 /*=ScalarSizeInBytes*/);
+ case Intrinsic::aarch64_sve_gather_prfw:
+ return combineSVEPrefetchVecBaseImmOff(
+ N, DAG, Intrinsic::aarch64_sve_gather_prfw_scaled_uxtw,
+ 4 /*=ScalarSizeInBytes*/);
+ case Intrinsic::aarch64_sve_gather_prfd:
+ return combineSVEPrefetchVecBaseImmOff(
+ N, DAG, Intrinsic::aarch64_sve_gather_prfd_scaled_uxtw,
+ 8 /*=ScalarSizeInBytes*/);
+ case Intrinsic::aarch64_sve_gather_prfb_scaled_uxtw:
+ case Intrinsic::aarch64_sve_gather_prfb_scaled_sxtw:
+ case Intrinsic::aarch64_sve_gather_prfh_scaled_uxtw:
+ case Intrinsic::aarch64_sve_gather_prfh_scaled_sxtw:
+ case Intrinsic::aarch64_sve_gather_prfw_scaled_uxtw:
+ case Intrinsic::aarch64_sve_gather_prfw_scaled_sxtw:
+ case Intrinsic::aarch64_sve_gather_prfd_scaled_uxtw:
+ case Intrinsic::aarch64_sve_gather_prfd_scaled_sxtw:
+ return legalizeSVEGatherPrefetchOffsVec(N, DAG);
case Intrinsic::aarch64_neon_ld2:
case Intrinsic::aarch64_neon_ld3:
case Intrinsic::aarch64_neon_ld4:
diff --git a/llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td b/llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td
index 833aee041aa5..a83e23832ba1 100644
--- a/llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td
+++ b/llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td
@@ -880,37 +880,37 @@ multiclass sve_prefetch<SDPatternOperator prefetch, ValueType PredTy, Instructio
// Gather prefetch using scaled 32-bit offsets, e.g.
// prfh pldl1keep, p0, [x0, z0.s, uxtw #1]
- defm PRFB_S : sve_mem_32b_prfm_sv_scaled<0b00, "prfb", ZPR32ExtSXTW8Only, ZPR32ExtUXTW8Only>;
- defm PRFH_S : sve_mem_32b_prfm_sv_scaled<0b01, "prfh", ZPR32ExtSXTW16, ZPR32ExtUXTW16>;
- defm PRFW_S : sve_mem_32b_prfm_sv_scaled<0b10, "prfw", ZPR32ExtSXTW32, ZPR32ExtUXTW32>;
- defm PRFD_S : sve_mem_32b_prfm_sv_scaled<0b11, "prfd", ZPR32ExtSXTW64, ZPR32ExtUXTW64>;
+ defm PRFB_S : sve_mem_32b_prfm_sv_scaled<0b00, "prfb", ZPR32ExtSXTW8Only, ZPR32ExtUXTW8Only, int_aarch64_sve_gather_prfb_scaled_sxtw, int_aarch64_sve_gather_prfb_scaled_uxtw>;
+ defm PRFH_S : sve_mem_32b_prfm_sv_scaled<0b01, "prfh", ZPR32ExtSXTW16, ZPR32ExtUXTW16, int_aarch64_sve_gather_prfh_scaled_sxtw, int_aarch64_sve_gather_prfh_scaled_uxtw>;
+ defm PRFW_S : sve_mem_32b_prfm_sv_scaled<0b10, "prfw", ZPR32ExtSXTW32, ZPR32ExtUXTW32, int_aarch64_sve_gather_prfw_scaled_sxtw, int_aarch64_sve_gather_prfw_scaled_uxtw>;
+ defm PRFD_S : sve_mem_32b_prfm_sv_scaled<0b11, "prfd", ZPR32ExtSXTW64, ZPR32ExtUXTW64, int_aarch64_sve_gather_prfd_scaled_sxtw, int_aarch64_sve_gather_prfd_scaled_uxtw>;
// Gather prefetch using unpacked, scaled 32-bit offsets, e.g.
// prfh pldl1keep, p0, [x0, z0.d, uxtw #1]
- defm PRFB_D : sve_mem_64b_prfm_sv_ext_scaled<0b00, "prfb", ZPR64ExtSXTW8Only, ZPR64ExtUXTW8Only>;
- defm PRFH_D : sve_mem_64b_prfm_sv_ext_scaled<0b01, "prfh", ZPR64ExtSXTW16, ZPR64ExtUXTW16>;
- defm PRFW_D : sve_mem_64b_prfm_sv_ext_scaled<0b10, "prfw", ZPR64ExtSXTW32, ZPR64ExtUXTW32>;
- defm PRFD_D : sve_mem_64b_prfm_sv_ext_scaled<0b11, "prfd", ZPR64ExtSXTW64, ZPR64ExtUXTW64>;
+ defm PRFB_D : sve_mem_64b_prfm_sv_ext_scaled<0b00, "prfb", ZPR64ExtSXTW8Only, ZPR64ExtUXTW8Only, int_aarch64_sve_gather_prfb_scaled_sxtw, int_aarch64_sve_gather_prfb_scaled_uxtw>;
+ defm PRFH_D : sve_mem_64b_prfm_sv_ext_scaled<0b01, "prfh", ZPR64ExtSXTW16, ZPR64ExtUXTW16, int_aarch64_sve_gather_prfh_scaled_sxtw, int_aarch64_sve_gather_prfh_scaled_uxtw>;
+ defm PRFW_D : sve_mem_64b_prfm_sv_ext_scaled<0b10, "prfw", ZPR64ExtSXTW32, ZPR64ExtUXTW32, int_aarch64_sve_gather_prfw_scaled_sxtw, int_aarch64_sve_gather_prfw_scaled_uxtw>;
+ defm PRFD_D : sve_mem_64b_prfm_sv_ext_scaled<0b11, "prfd", ZPR64ExtSXTW64, ZPR64ExtUXTW64, int_aarch64_sve_gather_prfd_scaled_sxtw, int_aarch64_sve_gather_prfd_scaled_uxtw>;
// Gather prefetch using scaled 64-bit offsets, e.g.
// prfh pldl1keep, p0, [x0, z0.d, lsl #1]
- defm PRFB_D_SCALED : sve_mem_64b_prfm_sv_lsl_scaled<0b00, "prfb", ZPR64ExtLSL8>;
- defm PRFH_D_SCALED : sve_mem_64b_prfm_sv_lsl_scaled<0b01, "prfh", ZPR64ExtLSL16>;
- defm PRFW_D_SCALED : sve_mem_64b_prfm_sv_lsl_scaled<0b10, "prfw", ZPR64ExtLSL32>;
- defm PRFD_D_SCALED : sve_mem_64b_prfm_sv_lsl_scaled<0b11, "prfd", ZPR64ExtLSL64>;
+ defm PRFB_D_SCALED : sve_mem_64b_prfm_sv_lsl_scaled<0b00, "prfb", ZPR64ExtLSL8, int_aarch64_sve_gather_prfb_scaled>;
+ defm PRFH_D_SCALED : sve_mem_64b_prfm_sv_lsl_scaled<0b01, "prfh", ZPR64ExtLSL16, int_aarch64_sve_gather_prfh_scaled>;
+ defm PRFW_D_SCALED : sve_mem_64b_prfm_sv_lsl_scaled<0b10, "prfw", ZPR64ExtLSL32, int_aarch64_sve_gather_prfw_scaled>;
+ defm PRFD_D_SCALED : sve_mem_64b_prfm_sv_lsl_scaled<0b11, "prfd", ZPR64ExtLSL64, int_aarch64_sve_gather_prfd_scaled>;
// Gather prefetch using 32/64-bit pointers with offset, e.g.
// prfh pldl1keep, p0, [z0.s, #16]
// prfh pldl1keep, p0, [z0.d, #16]
- defm PRFB_S_PZI : sve_mem_32b_prfm_vi<0b00, "prfb", imm0_31>;
- defm PRFH_S_PZI : sve_mem_32b_prfm_vi<0b01, "prfh", uimm5s2>;
- defm PRFW_S_PZI : sve_mem_32b_prfm_vi<0b10, "prfw", uimm5s4>;
- defm PRFD_S_PZI : sve_mem_32b_prfm_vi<0b11, "prfd", uimm5s8>;
-
- defm PRFB_D_PZI : sve_mem_64b_prfm_vi<0b00, "prfb", imm0_31>;
- defm PRFH_D_PZI : sve_mem_64b_prfm_vi<0b01, "prfh", uimm5s2>;
- defm PRFW_D_PZI : sve_mem_64b_prfm_vi<0b10, "prfw", uimm5s4>;
- defm PRFD_D_PZI : sve_mem_64b_prfm_vi<0b11, "prfd", uimm5s8>;
+ defm PRFB_S_PZI : sve_mem_32b_prfm_vi<0b00, "prfb", imm0_31, int_aarch64_sve_gather_prfb>;
+ defm PRFH_S_PZI : sve_mem_32b_prfm_vi<0b01, "prfh", uimm5s2, int_aarch64_sve_gather_prfh>;
+ defm PRFW_S_PZI : sve_mem_32b_prfm_vi<0b10, "prfw", uimm5s4, int_aarch64_sve_gather_prfw>;
+ defm PRFD_S_PZI : sve_mem_32b_prfm_vi<0b11, "prfd", uimm5s8, int_aarch64_sve_gather_prfd>;
+
+ defm PRFB_D_PZI : sve_mem_64b_prfm_vi<0b00, "prfb", imm0_31, int_aarch64_sve_gather_prfb>;
+ defm PRFH_D_PZI : sve_mem_64b_prfm_vi<0b01, "prfh", uimm5s2, int_aarch64_sve_gather_prfh>;
+ defm PRFW_D_PZI : sve_mem_64b_prfm_vi<0b10, "prfw", uimm5s4, int_aarch64_sve_gather_prfw>;
+ defm PRFD_D_PZI : sve_mem_64b_prfm_vi<0b11, "prfd", uimm5s8, int_aarch64_sve_gather_prfd>;
defm ADR_SXTW_ZZZ_D : sve_int_bin_cons_misc_0_a_sxtw<0b00, "adr">;
defm ADR_UXTW_ZZZ_D : sve_int_bin_cons_misc_0_a_uxtw<0b01, "adr">;
diff --git a/llvm/lib/Target/AArch64/MCTargetDesc/AArch64AddressingModes.h b/llvm/lib/Target/AArch64/MCTargetDesc/AArch64AddressingModes.h
index 9814f7625853..3f631df252af 100644
--- a/llvm/lib/Target/AArch64/MCTargetDesc/AArch64AddressingModes.h
+++ b/llvm/lib/Target/AArch64/MCTargetDesc/AArch64AddressingModes.h
@@ -840,6 +840,26 @@ inline static bool isAnyMOVWMovAlias(uint64_t Value, int RegWidth) {
return isAnyMOVZMovAlias(Value, RegWidth);
}
+/// Check if the value of \p OffsetInBytes can be used as an immediate for
+/// the gather load/prefetch and scatter store instructions with vector base and
+/// immediate offset addressing mode:
+///
+/// [<Zn>.[S|D]{, #<imm>}]
+///
+/// where <imm> = sizeof(<T>) * k, for k = 0, 1, ..., 31.
+static bool isValidImmForSVEVecImmAddrMode(unsigned OffsetInBytes,
+ unsigned ScalarSizeInBytes) {
+ // The immediate is not a multiple of the scalar size.
+ if (OffsetInBytes % ScalarSizeInBytes)
+ return false;
+
+ // The immediate is out of range.
+ if (OffsetInBytes / ScalarSizeInBytes > 31)
+ return false;
+
+ return true;
+}
+
} // end namespace AArch64_AM
} // end namespace llvm
diff --git a/llvm/lib/Target/AArch64/SVEInstrFormats.td b/llvm/lib/Target/AArch64/SVEInstrFormats.td
index 6a9d3acff8fb..3937d6390c4d 100644
--- a/llvm/lib/Target/AArch64/SVEInstrFormats.td
+++ b/llvm/lib/Target/AArch64/SVEInstrFormats.td
@@ -6455,9 +6455,17 @@ class sve_mem_32b_prfm_sv<bits<2> msz, bit xs, string asm,
multiclass sve_mem_32b_prfm_sv_scaled<bits<2> msz, string asm,
RegisterOperand sxtw_opnd,
- RegisterOperand uxtw_opnd> {
+ RegisterOperand uxtw_opnd,
+ PatFrag op_sxtw,
+ PatFrag op_uxtw> {
def _UXTW_SCALED : sve_mem_32b_prfm_sv<msz, 0, asm, uxtw_opnd>;
def _SXTW_SCALED : sve_mem_32b_prfm_sv<msz, 1, asm, sxtw_opnd>;
+
+ def : Pat<(op_uxtw (nxv4i1 PPR3bAny:$Pg), (i64 GPR64sp:$Rn), (nxv4i32 uxtw_opnd:$Zm), (i32 sve_prfop:$prfop)),
+ (!cast<Instruction>(NAME # _UXTW_SCALED) sve_prfop:$prfop, PPR3bAny:$Pg, GPR64sp:$Rn, uxtw_opnd:$Zm)>;
+
+ def : Pat<(op_sxtw (nxv4i1 PPR3bAny:$Pg), (i64 GPR64sp:$Rn), (nxv4i32 sxtw_opnd:$Zm), (i32 sve_prfop:$prfop)),
+ (!cast<Instruction>(NAME # _SXTW_SCALED) sve_prfop:$prfop, PPR3bAny:$Pg, GPR64sp:$Rn, sxtw_opnd:$Zm)>;
}
class sve_mem_32b_prfm_vi<bits<2> msz, string asm, Operand imm_ty>
@@ -6480,11 +6488,14 @@ class sve_mem_32b_prfm_vi<bits<2> msz, string asm, Operand imm_ty>
let Inst{3-0} = prfop;
}
-multiclass sve_mem_32b_prfm_vi<bits<2> msz, string asm, Operand imm_ty> {
+multiclass sve_mem_32b_prfm_vi<bits<2> msz, string asm, Operand imm_ty, SDPatternOperator op> {
def NAME : sve_mem_32b_prfm_vi<msz, asm, imm_ty>;
def : InstAlias<asm # "\t$prfop, $Pg, [$Zn]",
(!cast<Instruction>(NAME) sve_prfop:$prfop, PPR3bAny:$Pg, ZPR32:$Zn, 0), 1>;
+
+ def : Pat<(op (nxv4i1 PPR_3b:$Pg), (nxv4i32 ZPR32:$Zn), (i64 imm_ty:$imm), (i32 sve_prfop:$prfop)),
+ (!cast<Instruction>(NAME) sve_prfop:$prfop, PPR_3b:$Pg, ZPR32:$Zn, imm_ty:$imm)>;
}
class sve_mem_z_fill<string asm>
@@ -6798,14 +6809,27 @@ class sve_mem_64b_prfm_sv<bits<2> msz, bit xs, bit lsl, string asm,
multiclass sve_mem_64b_prfm_sv_ext_scaled<bits<2> msz, string asm,
RegisterOperand sxtw_opnd,
- RegisterOperand uxtw_opnd> {
+ RegisterOperand uxtw_opnd,
+ PatFrag op_sxtw,
+ PatFrag op_uxtw> {
def _UXTW_SCALED : sve_mem_64b_prfm_sv<msz, 0, 0, asm, uxtw_opnd>;
def _SXTW_SCALED : sve_mem_64b_prfm_sv<msz, 1, 0, asm, sxtw_opnd>;
+
+ def : Pat<(op_uxtw (nxv2i1 PPR3bAny:$Pg), (i64 GPR64sp:$Rn), (nxv2i64 uxtw_opnd:$Zm), (i32 sve_prfop:$prfop)),
+ (!cast<Instruction>(NAME # _UXTW_SCALED) sve_prfop:$prfop, PPR3bAny:$Pg, GPR64sp:$Rn, uxtw_opnd:$Zm)>;
+
+ def : Pat<(op_sxtw (nxv2i1 PPR3bAny:$Pg), (i64 GPR64sp:$Rn), (nxv2i64 sxtw_opnd:$Zm), (i32 sve_prfop:$prfop)),
+ (!cast<Instruction>(NAME # _SXTW_SCALED) sve_prfop:$prfop, PPR3bAny:$Pg, GPR64sp:$Rn, sxtw_opnd:$Zm)>;
+
}
multiclass sve_mem_64b_prfm_sv_lsl_scaled<bits<2> msz, string asm,
- RegisterOperand zprext> {
+ RegisterOperand zprext, PatFrag frag> {
def NAME : sve_mem_64b_prfm_sv<msz, 1, 1, asm, zprext>;
+
+ def : Pat<(frag (nxv2i1 PPR3bAny:$Pg), (i64 GPR64sp:$Rn), (nxv2i64 zprext:$Zm), (i32 sve_prfop:$prfop)),
+ (!cast<Instruction>(NAME) sve_prfop:$prfop, PPR3bAny:$Pg, GPR64sp:$Rn, zprext:$Zm)>;
+
}
@@ -6831,11 +6855,14 @@ class sve_mem_64b_prfm_vi<bits<2> msz, string asm, Operand imm_ty>
let hasSideEffects = 1;
}
-multiclass sve_mem_64b_prfm_vi<bits<2> msz, string asm, Operand imm_ty> {
+multiclass sve_mem_64b_prfm_vi<bits<2> msz, string asm, Operand imm_ty, SDPatternOperator op> {
def NAME : sve_mem_64b_prfm_vi<msz, asm, imm_ty>;
def : InstAlias<asm # "\t$prfop, $Pg, [$Zn]",
(!cast<Instruction>(NAME) sve_prfop:$prfop, PPR3bAny:$Pg, ZPR64:$Zn, 0), 1>;
+
+  def : Pat<(op (nxv2i1 PPR_3b:$Pg), (nxv2i64 ZPR64:$Zn), (i64 imm_ty:$imm), (i32 sve_prfop:$prfop)),
+            (!cast<Instruction>(NAME) sve_prfop:$prfop, PPR_3b:$Pg, ZPR64:$Zn, imm_ty:$imm)>;
}
//===----------------------------------------------------------------------===//
diff --git a/llvm/test/CodeGen/AArch64/sve-intrinsics-gather-prefetches-scaled-offset.ll b/llvm/test/CodeGen/AArch64/sve-intrinsics-gather-prefetches-scaled-offset.ll
new file mode 100644
index 000000000000..78251707a010
--- /dev/null
+++ b/llvm/test/CodeGen/AArch64/sve-intrinsics-gather-prefetches-scaled-offset.ll
@@ -0,0 +1,200 @@
+; RUN: llc -mtriple=aarch64--linux-gnu -mattr=+sve --asm-verbose=false < %s | FileCheck %s
+
+; PRFB <prfop>, <Pg>, [<Xn|SP>, <Zm>.S, <mod>] -> 32-bit scaled offset
+define void @llvm_aarch64_sve_gather_prfb_scaled_uxtw_nx4vi32(<vscale x 4 x i1> %Pg, i8* %base, <vscale x 4 x i32> %offset) nounwind {
+; CHECK-LABEL: llvm_aarch64_sve_gather_prfb_scaled_uxtw_nx4vi32:
+; CHECK-NEXT: prfb pldl1strm, p0, [x0, z0.s, uxtw]
+; CHECK-NEXT: ret
+ call void @llvm.aarch64.sve.gather.prfb.scaled.uxtw.nx4vi32(<vscale x 4 x i1> %Pg, i8* %base, <vscale x 4 x i32> %offset, i32 1)
+ ret void
+ }
+
+define void @llvm_aarch64_sve_gather_prfb_scaled_sxtw_nx4vi32(<vscale x 4 x i1> %Pg, i8* %base, <vscale x 4 x i32> %offset) nounwind {
+; CHECK-LABEL: llvm_aarch64_sve_gather_prfb_scaled_sxtw_nx4vi32:
+; CHECK-NEXT: prfb pldl1strm, p0, [x0, z0.s, sxtw]
+; CHECK-NEXT: ret
+ call void @llvm.aarch64.sve.gather.prfb.scaled.sxtw.nx4vi32(<vscale x 4 x i1> %Pg, i8* %base, <vscale x 4 x i32> %offset, i32 1)
+ ret void
+ }
+
+; PRFB <prfop>, <Pg>, [<Xn|SP>, <Zm>.D, <mod>] -> 32-bit unpacked scaled offset
+
+define void @llvm_aarch64_sve_gather_prfb_scaled_uxtw_nx2vi64(<vscale x 2 x i1> %Pg, i8* %base, <vscale x 2 x i32> %offset) nounwind {
+; CHECK-LABEL: llvm_aarch64_sve_gather_prfb_scaled_uxtw_nx2vi64:
+; CHECK-NEXT: prfb pldl1strm, p0, [x0, z0.d, uxtw]
+; CHECK-NEXT: ret
+ call void @llvm.aarch64.sve.gather.prfb.scaled.uxtw.nx2vi64(<vscale x 2 x i1> %Pg, i8* %base, <vscale x 2 x i32> %offset, i32 1)
+ ret void
+ }
+
+define void @llvm_aarch64_sve_gather_prfb_scaled_sxtw_nx2vi64(<vscale x 2 x i1> %Pg, i8* %base, <vscale x 2 x i32> %offset) nounwind {
+; CHECK-LABEL: llvm_aarch64_sve_gather_prfb_scaled_sxtw_nx2vi64:
+; CHECK-NEXT: prfb pldl1strm, p0, [x0, z0.d, sxtw]
+; CHECK-NEXT: ret
+ call void @llvm.aarch64.sve.gather.prfb.scaled.sxtw.nx2vi64(<vscale x 2 x i1> %Pg, i8* %base, <vscale x 2 x i32> %offset, i32 1)
+ ret void
+ }
+; PRFB <prfop>, <Pg>, [<Xn|SP>, <Zm>.D] -> 64-bit scaled offset
+define void @llvm_aarch64_sve_gather_prfb_scaled_nx2vi64(<vscale x 2 x i1> %Pg, i8* %base, <vscale x 2 x i64> %offset) nounwind {
+; CHECK-LABEL: llvm_aarch64_sve_gather_prfb_scaled_nx2vi64:
+; CHECK-NEXT: prfb pldl1strm, p0, [x0, z0.d]
+; CHECK-NEXT: ret
+ call void @llvm.aarch64.sve.gather.prfb.scaled.nx2vi64(<vscale x 2 x i1> %Pg, i8* %base, <vscale x 2 x i64> %offset, i32 1)
+ ret void
+ }
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+; PRFH <prfop>, <Pg>, [<Xn|SP>, <Zm>.S, <mod>] -> 32-bit scaled offset
+define void @llvm_aarch64_sve_gather_prfh_scaled_uxtw_nx4vi32(<vscale x 4 x i1> %Pg, i8* %base, <vscale x 4 x i32> %offset) nounwind {
+; CHECK-LABEL: llvm_aarch64_sve_gather_prfh_scaled_uxtw_nx4vi32:
+; CHECK-NEXT: prfh pldl1strm, p0, [x0, z0.s, uxtw #1]
+; CHECK-NEXT: ret
+ call void @llvm.aarch64.sve.gather.prfh.scaled.uxtw.nx4vi32(<vscale x 4 x i1> %Pg, i8* %base, <vscale x 4 x i32> %offset, i32 1)
+ ret void
+ }
+
+define void @llvm_aarch64_sve_gather_prfh_scaled_sxtw_nx4vi32(<vscale x 4 x i1> %Pg, i8* %base, <vscale x 4 x i32> %offset) nounwind {
+; CHECK-LABEL: llvm_aarch64_sve_gather_prfh_scaled_sxtw_nx4vi32:
+; CHECK-NEXT: prfh pldl1strm, p0, [x0, z0.s, sxtw #1]
+; CHECK-NEXT: ret
+ call void @llvm.aarch64.sve.gather.prfh.scaled.sxtw.nx4vi32(<vscale x 4 x i1> %Pg, i8* %base, <vscale x 4 x i32> %offset, i32 1)
+ ret void
+ }
+
+; PRFH <prfop>, <Pg>, [<Xn|SP>, <Zm>.D, <mod> #1] -> 32-bit unpacked scaled offset
+define void @llvm_aarch64_sve_gather_prfh_scaled_uxtw_nx2vi64(<vscale x 2 x i1> %Pg, i8* %base, <vscale x 2 x i32> %offset) nounwind {
+; CHECK-LABEL: llvm_aarch64_sve_gather_prfh_scaled_uxtw_nx2vi64:
+; CHECK-NEXT: prfh pldl1strm, p0, [x0, z0.d, uxtw #1]
+; CHECK-NEXT: ret
+ call void @llvm.aarch64.sve.gather.prfh.scaled.uxtw.nx2vi64(<vscale x 2 x i1> %Pg, i8* %base, <vscale x 2 x i32> %offset, i32 1)
+ ret void
+ }
+
+define void @llvm_aarch64_sve_gather_prfh_scaled_sxtw_nx2vi64(<vscale x 2 x i1> %Pg, i8* %base, <vscale x 2 x i32> %offset) nounwind {
+; CHECK-LABEL: llvm_aarch64_sve_gather_prfh_scaled_sxtw_nx2vi64:
+; CHECK-NEXT: prfh pldl1strm, p0, [x0, z0.d, sxtw #1]
+; CHECK-NEXT: ret
+ call void @llvm.aarch64.sve.gather.prfh.scaled.sxtw.nx2vi64(<vscale x 2 x i1> %Pg, i8* %base, <vscale x 2 x i32> %offset, i32 1)
+ ret void
+ }
+
+; PRFH <prfop>, <Pg>, [<Xn|SP>, <Zm>.D] -> 64-bit scaled offset
+define void @llvm_aarch64_sve_gather_prfh_scaled_nx2vi64(<vscale x 2 x i1> %Pg, i8* %base, <vscale x 2 x i64> %offset) nounwind {
+; CHECK-LABEL: llvm_aarch64_sve_gather_prfh_scaled_nx2vi64:
+; CHECK-NEXT: prfh pldl1strm, p0, [x0, z0.d, lsl #1]
+; CHECK-NEXT: ret
+ call void @llvm.aarch64.sve.gather.prfh.scaled.nx2vi64(<vscale x 2 x i1> %Pg, i8* %base, <vscale x 2 x i64> %offset, i32 1)
+ ret void
+ }
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+; PRFW <prfop>, <Pg>, [<Xn|SP>, <Zm>.S, <mod>] -> 32-bit scaled offset
+define void @llvm_aarch64_sve_gather_prfw_scaled_uxtw_nx4vi32(<vscale x 4 x i1> %Pg, i8* %base, <vscale x 4 x i32> %offset) nounwind {
+; CHECK-LABEL: llvm_aarch64_sve_gather_prfw_scaled_uxtw_nx4vi32:
+; CHECK-NEXT: prfw pldl1strm, p0, [x0, z0.s, uxtw #2]
+; CHECK-NEXT: ret
+ call void @llvm.aarch64.sve.gather.prfw.scaled.uxtw.nx4vi32(<vscale x 4 x i1> %Pg, i8* %base, <vscale x 4 x i32> %offset, i32 1)
+ ret void
+ }
+
+define void @llvm_aarch64_sve_gather_prfw_scaled_sxtw_nx4vi32(<vscale x 4 x i1> %Pg, i8* %base, <vscale x 4 x i32> %offset) nounwind {
+; CHECK-LABEL: llvm_aarch64_sve_gather_prfw_scaled_sxtw_nx4vi32:
+; CHECK-NEXT: prfw pldl1strm, p0, [x0, z0.s, sxtw #2]
+; CHECK-NEXT: ret
+ call void @llvm.aarch64.sve.gather.prfw.scaled.sxtw.nx4vi32(<vscale x 4 x i1> %Pg, i8* %base, <vscale x 4 x i32> %offset, i32 1)
+ ret void
+ }
+
+; PRFW <prfop>, <Pg>, [<Xn|SP>, <Zm>.D, <mod> #2] -> 32-bit unpacked scaled offset
+define void @llvm_aarch64_sve_gather_prfw_scaled_uxtw_nx2vi64(<vscale x 2 x i1> %Pg, i8* %base, <vscale x 2 x i32> %offset) nounwind {
+; CHECK-LABEL: llvm_aarch64_sve_gather_prfw_scaled_uxtw_nx2vi64:
+; CHECK-NEXT: prfw pldl1strm, p0, [x0, z0.d, uxtw #2]
+; CHECK-NEXT: ret
+ call void @llvm.aarch64.sve.gather.prfw.scaled.uxtw.nx2vi64(<vscale x 2 x i1> %Pg, i8* %base, <vscale x 2 x i32> %offset, i32 1)
+ ret void
+ }
+
+define void @llvm_aarch64_sve_gather_prfw_scaled_sxtw_nx2vi64(<vscale x 2 x i1> %Pg, i8* %base, <vscale x 2 x i32> %offset) nounwind {
+; CHECK-LABEL: llvm_aarch64_sve_gather_prfw_scaled_sxtw_nx2vi64:
+; CHECK-NEXT: prfw pldl1strm, p0, [x0, z0.d, sxtw #2]
+; CHECK-NEXT: ret
+ call void @llvm.aarch64.sve.gather.prfw.scaled.sxtw.nx2vi64(<vscale x 2 x i1> %Pg, i8* %base, <vscale x 2 x i32> %offset, i32 1)
+ ret void
+ }
+
+; PRFW <prfop>, <Pg>, [<Xn|SP>, <Zm>.D] -> 64-bit scaled offset
+define void @llvm_aarch64_sve_gather_prfw_scaled_nx2vi64(<vscale x 2 x i1> %Pg, i8* %base, <vscale x 2 x i64> %offset) nounwind {
+; CHECK-LABEL: llvm_aarch64_sve_gather_prfw_scaled_nx2vi64:
+; CHECK-NEXT: prfw pldl1strm, p0, [x0, z0.d, lsl #2]
+; CHECK-NEXT: ret
+ call void @llvm.aarch64.sve.gather.prfw.scaled.nx2vi64(<vscale x 2 x i1> %Pg, i8* %base, <vscale x 2 x i64> %offset, i32 1)
+ ret void
+ }
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+; PRFD <prfop>, <Pg>, [<Xn|SP>, <Zm>.S, <mod>] -> 32-bit scaled offset
+define void @llvm_aarch64_sve_gather_prfd_scaled_uxtw_nx4vi32(<vscale x 4 x i1> %Pg, i8* %base, <vscale x 4 x i32> %offset) nounwind {
+; CHECK-LABEL: llvm_aarch64_sve_gather_prfd_scaled_uxtw_nx4vi32:
+; CHECK-NEXT: prfd pldl1strm, p0, [x0, z0.s, uxtw #3]
+; CHECK-NEXT: ret
+ call void @llvm.aarch64.sve.gather.prfd.scaled.uxtw.nx4vi32(<vscale x 4 x i1> %Pg, i8* %base, <vscale x 4 x i32> %offset, i32 1)
+ ret void
+ }
+
+define void @llvm_aarch64_sve_gather_prfd_scaled_sxtw_nx4vi32(<vscale x 4 x i1> %Pg, i8* %base, <vscale x 4 x i32> %offset) nounwind {
+; CHECK-LABEL: llvm_aarch64_sve_gather_prfd_scaled_sxtw_nx4vi32:
+; CHECK-NEXT: prfd pldl1strm, p0, [x0, z0.s, sxtw #3]
+; CHECK-NEXT: ret
+ call void @llvm.aarch64.sve.gather.prfd.scaled.sxtw.nx4vi32(<vscale x 4 x i1> %Pg, i8* %base, <vscale x 4 x i32> %offset, i32 1)
+ ret void
+ }
+
+; PRFD <prfop>, <Pg>, [<Xn|SP>, <Zm>.D, <mod> #3] -> 32-bit unpacked scaled offset
+define void @llvm_aarch64_sve_gather_prfd_scaled_uxtw_nx2vi64(<vscale x 2 x i1> %Pg, i8* %base, <vscale x 2 x i32> %offset) nounwind {
+; CHECK-LABEL: llvm_aarch64_sve_gather_prfd_scaled_uxtw_nx2vi64:
+; CHECK-NEXT: prfd pldl1strm, p0, [x0, z0.d, uxtw #3]
+; CHECK-NEXT: ret
+ call void @llvm.aarch64.sve.gather.prfd.scaled.uxtw.nx2vi64(<vscale x 2 x i1> %Pg, i8* %base, <vscale x 2 x i32> %offset, i32 1)
+ ret void
+ }
+
+define void @llvm_aarch64_sve_gather_prfd_scaled_sxtw_nx2vi64(<vscale x 2 x i1> %Pg, i8* %base, <vscale x 2 x i32> %offset) nounwind {
+; CHECK-LABEL: llvm_aarch64_sve_gather_prfd_scaled_sxtw_nx2vi64:
+; CHECK-NEXT: prfd pldl1strm, p0, [x0, z0.d, sxtw #3]
+; CHECK-NEXT: ret
+ call void @llvm.aarch64.sve.gather.prfd.scaled.sxtw.nx2vi64(<vscale x 2 x i1> %Pg, i8* %base, <vscale x 2 x i32> %offset, i32 1)
+ ret void
+ }
+
+; PRFD <prfop>, <Pg>, [<Xn|SP>, <Zm>.D] -> 64-bit scaled offset
+define void @llvm_aarch64_sve_gather_prfd_scaled_nx2vi64(<vscale x 2 x i1> %Pg, i8* %base, <vscale x 2 x i64> %offset) nounwind {
+; CHECK-LABEL: llvm_aarch64_sve_gather_prfd_scaled_nx2vi64:
+; CHECK-NEXT: prfd pldl1strm, p0, [x0, z0.d, lsl #3]
+; CHECK-NEXT: ret
+ call void @llvm.aarch64.sve.gather.prfd.scaled.nx2vi64(<vscale x 2 x i1> %Pg, i8* %base, <vscale x 2 x i64> %offset, i32 1)
+ ret void
+ }
+
+declare void @llvm.aarch64.sve.gather.prfb.scaled.uxtw.nx4vi32(<vscale x 4 x i1> %Pg, i8* %base, <vscale x 4 x i32> %offset, i32 %prfop)
+declare void @llvm.aarch64.sve.gather.prfb.scaled.sxtw.nx4vi32(<vscale x 4 x i1> %Pg, i8* %base, <vscale x 4 x i32> %offset, i32 %prfop)
+declare void @llvm.aarch64.sve.gather.prfb.scaled.uxtw.nx2vi64(<vscale x 2 x i1> %Pg, i8* %base, <vscale x 2 x i32> %offset, i32 %prfop)
+declare void @llvm.aarch64.sve.gather.prfb.scaled.sxtw.nx2vi64(<vscale x 2 x i1> %Pg, i8* %base, <vscale x 2 x i32> %offset, i32 %prfop)
+declare void @llvm.aarch64.sve.gather.prfb.scaled.nx2vi64(<vscale x 2 x i1> %Pg, i8* %base, <vscale x 2 x i64> %offset, i32 %prfop)
+declare void @llvm.aarch64.sve.gather.prfh.scaled.uxtw.nx4vi32(<vscale x 4 x i1> %Pg, i8* %base, <vscale x 4 x i32> %offset, i32 %prfop)
+declare void @llvm.aarch64.sve.gather.prfh.scaled.sxtw.nx4vi32(<vscale x 4 x i1> %Pg, i8* %base, <vscale x 4 x i32> %offset, i32 %prfop)
+declare void @llvm.aarch64.sve.gather.prfh.scaled.uxtw.nx2vi64(<vscale x 2 x i1> %Pg, i8* %base, <vscale x 2 x i32> %offset, i32 %prfop)
+declare void @llvm.aarch64.sve.gather.prfh.scaled.sxtw.nx2vi64(<vscale x 2 x i1> %Pg, i8* %base, <vscale x 2 x i32> %offset, i32 %prfop)
+declare void @llvm.aarch64.sve.gather.prfh.scaled.nx2vi64(<vscale x 2 x i1> %Pg, i8* %base, <vscale x 2 x i64> %offset, i32 %prfop)
+declare void @llvm.aarch64.sve.gather.prfw.scaled.uxtw.nx4vi32(<vscale x 4 x i1> %Pg, i8* %base, <vscale x 4 x i32> %offset, i32 %prfop)
+declare void @llvm.aarch64.sve.gather.prfw.scaled.sxtw.nx4vi32(<vscale x 4 x i1> %Pg, i8* %base, <vscale x 4 x i32> %offset, i32 %prfop)
+declare void @llvm.aarch64.sve.gather.prfw.scaled.uxtw.nx2vi64(<vscale x 2 x i1> %Pg, i8* %base, <vscale x 2 x i32> %offset, i32 %prfop)
+declare void @llvm.aarch64.sve.gather.prfw.scaled.sxtw.nx2vi64(<vscale x 2 x i1> %Pg, i8* %base, <vscale x 2 x i32> %offset, i32 %prfop)
+declare void @llvm.aarch64.sve.gather.prfw.scaled.nx2vi64(<vscale x 2 x i1> %Pg, i8* %base, <vscale x 2 x i64> %offset, i32 %prfop)
+declare void @llvm.aarch64.sve.gather.prfd.scaled.uxtw.nx4vi32(<vscale x 4 x i1> %Pg, i8* %base, <vscale x 4 x i32> %offset, i32 %prfop)
+declare void @llvm.aarch64.sve.gather.prfd.scaled.sxtw.nx4vi32(<vscale x 4 x i1> %Pg, i8* %base, <vscale x 4 x i32> %offset, i32 %prfop)
+declare void @llvm.aarch64.sve.gather.prfd.scaled.uxtw.nx2vi64(<vscale x 2 x i1> %Pg, i8* %base, <vscale x 2 x i32> %offset, i32 %prfop)
+declare void @llvm.aarch64.sve.gather.prfd.scaled.sxtw.nx2vi64(<vscale x 2 x i1> %Pg, i8* %base, <vscale x 2 x i32> %offset, i32 %prfop)
+declare void @llvm.aarch64.sve.gather.prfd.scaled.nx2vi64(<vscale x 2 x i1> %Pg, i8* %base, <vscale x 2 x i64> %offset, i32 %prfop)
diff --git a/llvm/test/CodeGen/AArch64/sve-intrinsics-gather-prefetches-vect-base-imm-offset.ll b/llvm/test/CodeGen/AArch64/sve-intrinsics-gather-prefetches-vect-base-imm-offset.ll
new file mode 100644
index 000000000000..481302ce5972
--- /dev/null
+++ b/llvm/test/CodeGen/AArch64/sve-intrinsics-gather-prefetches-vect-base-imm-offset.ll
@@ -0,0 +1,82 @@
+; RUN: llc -mtriple=aarch64--linux-gnu -mattr=+sve --asm-verbose=false < %s | FileCheck %s
+
+; PRFB <prfop>, <Pg>, [<Zn>.S{, #<imm>}] -> 32-bit element
+define void @llvm_aarch64_sve_gather_prfb_nx4vi32(<vscale x 4 x i32> %bases, <vscale x 4 x i1> %Pg) nounwind {
+; CHECK-LABEL: llvm_aarch64_sve_gather_prfb_nx4vi32:
+; CHECK-NEXT: prfb pldl1strm, p0, [z0.s, #7]
+; CHECK-NEXT: ret
+ call void @llvm.aarch64.sve.gather.prfb.nx4vi32(<vscale x 4 x i1> %Pg, <vscale x 4 x i32> %bases, i64 7, i32 1)
+ ret void
+}
+
+; PRFB <prfop>, <Pg>, [<Zn>.D{, #<imm>}] -> 64-bit element
+define void @llvm_aarch64_sve_gather_prfb_nx2vi64(<vscale x 2 x i64> %bases, <vscale x 2 x i1> %Pg) nounwind {
+; CHECK-LABEL: llvm_aarch64_sve_gather_prfb_nx2vi64:
+; CHECK-NEXT: prfb pldl1strm, p0, [z0.d, #7]
+; CHECK-NEXT: ret
+ call void @llvm.aarch64.sve.gather.prfb.nx2vi64(<vscale x 2 x i1> %Pg, <vscale x 2 x i64> %bases, i64 7, i32 1)
+ ret void
+}
+
+; PRFH <prfop>, <Pg>, [<Zn>.S{, #<imm>}] -> 32-bit element
+define void @llvm_aarch64_sve_gather_prfh_nx4vi32(<vscale x 4 x i32> %bases, <vscale x 4 x i1> %Pg) nounwind {
+; CHECK-LABEL: llvm_aarch64_sve_gather_prfh_nx4vi32:
+; CHECK-NEXT: prfh pldl1strm, p0, [z0.s, #6]
+; CHECK-NEXT: ret
+ call void @llvm.aarch64.sve.gather.prfh.nx4vi32(<vscale x 4 x i1> %Pg, <vscale x 4 x i32> %bases, i64 6, i32 1)
+ ret void
+}
+
+; PRFH <prfop>, <Pg>, [<Zn>.D{, #<imm>}] -> 64-bit element
+define void @llvm_aarch64_sve_gather_prfh_nx2vi64(<vscale x 2 x i64> %bases, <vscale x 2 x i1> %Pg) nounwind {
+; CHECK-LABEL: llvm_aarch64_sve_gather_prfh_nx2vi64:
+; CHECK-NEXT: prfh pldl1strm, p0, [z0.d, #6]
+; CHECK-NEXT: ret
+ call void @llvm.aarch64.sve.gather.prfh.nx2vi64(<vscale x 2 x i1> %Pg, <vscale x 2 x i64> %bases, i64 6, i32 1)
+ ret void
+}
+
+; PRFW <prfop>, <Pg>, [<Zn>.S{, #<imm>}] -> 32-bit element
+define void @llvm_aarch64_sve_gather_prfw_nx4vi32(<vscale x 4 x i32> %bases, <vscale x 4 x i1> %Pg) nounwind {
+; CHECK-LABEL: llvm_aarch64_sve_gather_prfw_nx4vi32:
+; CHECK-NEXT: prfw pldl1strm, p0, [z0.s, #12]
+; CHECK-NEXT: ret
+ call void @llvm.aarch64.sve.gather.prfw.nx4vi32(<vscale x 4 x i1> %Pg, <vscale x 4 x i32> %bases, i64 12, i32 1)
+ ret void
+}
+
+; PRFW <prfop>, <Pg>, [<Zn>.D{, #<imm>}] -> 64-bit element
+define void @llvm_aarch64_sve_gather_prfw_nx2vi64(<vscale x 2 x i64> %bases, <vscale x 2 x i1> %Pg) nounwind {
+; CHECK-LABEL: llvm_aarch64_sve_gather_prfw_nx2vi64:
+; CHECK-NEXT: prfw pldl1strm, p0, [z0.d, #12]
+; CHECK-NEXT: ret
+ call void @llvm.aarch64.sve.gather.prfw.nx2vi64(<vscale x 2 x i1> %Pg, <vscale x 2 x i64> %bases, i64 12, i32 1)
+ ret void
+}
+
+; PRFD <prfop>, <Pg>, [<Zn>.S{, #<imm>}] -> 32-bit element
+define void @llvm_aarch64_sve_gather_prfd_nx4vi32(<vscale x 4 x i32> %bases, <vscale x 4 x i1> %Pg) nounwind {
+; CHECK-LABEL: llvm_aarch64_sve_gather_prfd_nx4vi32:
+; CHECK-NEXT: prfd pldl1strm, p0, [z0.s, #16]
+; CHECK-NEXT: ret
+ call void @llvm.aarch64.sve.gather.prfd.nx4vi32(<vscale x 4 x i1> %Pg, <vscale x 4 x i32> %bases, i64 16, i32 1)
+ ret void
+}
+
+; PRFD <prfop>, <Pg>, [<Zn>.D{, #<imm>}] -> 64-bit element
+define void @llvm_aarch64_sve_gather_prfd_nx2vi64(<vscale x 2 x i64> %bases, <vscale x 2 x i1> %Pg) nounwind {
+; CHECK-LABEL: llvm_aarch64_sve_gather_prfd_nx2vi64:
+; CHECK-NEXT: prfd pldl1strm, p0, [z0.d, #16]
+; CHECK-NEXT: ret
+ call void @llvm.aarch64.sve.gather.prfd.nx2vi64(<vscale x 2 x i1> %Pg, <vscale x 2 x i64> %bases, i64 16, i32 1)
+ ret void
+}
+
+declare void @llvm.aarch64.sve.gather.prfb.nx4vi32(<vscale x 4 x i1> %Pg, <vscale x 4 x i32> %bases, i64 %imm, i32 %prfop)
+declare void @llvm.aarch64.sve.gather.prfb.nx2vi64(<vscale x 2 x i1> %Pg, <vscale x 2 x i64> %bases, i64 %imm, i32 %prfop)
+declare void @llvm.aarch64.sve.gather.prfh.nx4vi32(<vscale x 4 x i1> %Pg, <vscale x 4 x i32> %bases, i64 %imm, i32 %prfop)
+declare void @llvm.aarch64.sve.gather.prfh.nx2vi64(<vscale x 2 x i1> %Pg, <vscale x 2 x i64> %bases, i64 %imm, i32 %prfop)
+declare void @llvm.aarch64.sve.gather.prfw.nx4vi32(<vscale x 4 x i1> %Pg, <vscale x 4 x i32> %bases, i64 %imm, i32 %prfop)
+declare void @llvm.aarch64.sve.gather.prfw.nx2vi64(<vscale x 2 x i1> %Pg, <vscale x 2 x i64> %bases, i64 %imm, i32 %prfop)
+declare void @llvm.aarch64.sve.gather.prfd.nx4vi32(<vscale x 4 x i1> %Pg, <vscale x 4 x i32> %bases, i64 %imm, i32 %prfop)
+declare void @llvm.aarch64.sve.gather.prfd.nx2vi64(<vscale x 2 x i1> %Pg, <vscale x 2 x i64> %bases, i64 %imm, i32 %prfop)
diff --git a/llvm/test/CodeGen/AArch64/sve-intrinsics-gather-prefetches-vect-base-invalid-imm-offset.ll b/llvm/test/CodeGen/AArch64/sve-intrinsics-gather-prefetches-vect-base-invalid-imm-offset.ll
new file mode 100644
index 000000000000..4b0b42eb73b9
--- /dev/null
+++ b/llvm/test/CodeGen/AArch64/sve-intrinsics-gather-prefetches-vect-base-invalid-imm-offset.ll
@@ -0,0 +1,286 @@
+; RUN: llc -mtriple=aarch64--linux-gnu -mattr=+sve --asm-verbose=false < %s | FileCheck %s
+
+; PRFB <prfop>, <Pg>, [<Zn>.S{, #<imm>}] -> 32-bit element, imm = 0, 1, ..., 31
+define void @llvm_aarch64_sve_gather_prfb_nx4vi32_runtime_offset(<vscale x 4 x i32> %bases, i64 %imm, <vscale x 4 x i1> %Pg) nounwind {
+; CHECK-LABEL: llvm_aarch64_sve_gather_prfb_nx4vi32_runtime_offset:
+; CHECK-NEXT: prfb pldl1strm, p0, [x0, z0.s, uxtw]
+; CHECK-NEXT: ret
+ call void @llvm.aarch64.sve.gather.prfb.nx4vi32(<vscale x 4 x i1> %Pg, <vscale x 4 x i32> %bases, i64 %imm, i32 1)
+ ret void
+}
+
+define void @llvm_aarch64_sve_gather_prfb_nx4vi32_invalid_immediate_offset_upper_bound(<vscale x 4 x i32> %bases, <vscale x 4 x i1> %Pg) nounwind {
+; CHECK-LABEL: llvm_aarch64_sve_gather_prfb_nx4vi32_invalid_immediate_offset_upper_bound:
+; CHECK-NEXT: mov w[[N:[0-9]+]], #32
+; CHECK-NEXT: prfb pldl1strm, p0, [x[[N]], z0.s, uxtw]
+; CHECK-NEXT: ret
+ call void @llvm.aarch64.sve.gather.prfb.nx4vi32(<vscale x 4 x i1> %Pg, <vscale x 4 x i32> %bases, i64 32, i32 1)
+ ret void
+}
+
+define void @llvm_aarch64_sve_gather_prfb_nx4vi32_invalid_immediate_offset_lower_bound(<vscale x 4 x i32> %bases, <vscale x 4 x i1> %Pg) nounwind {
+; CHECK-LABEL: llvm_aarch64_sve_gather_prfb_nx4vi32_invalid_immediate_offset_lower_bound:
+; CHECK-NEXT: mov x[[N:[0-9]+]], #-1
+; CHECK-NEXT: prfb pldl1strm, p0, [x[[N:[0-9]+]], z0.s, uxtw]
+; CHECK-NEXT: ret
+ call void @llvm.aarch64.sve.gather.prfb.nx4vi32(<vscale x 4 x i1> %Pg, <vscale x 4 x i32> %bases, i64 -1, i32 1)
+ ret void
+}
+
+; PRFB <prfop>, <Pg>, [<Zn>.D{, #<imm>}] -> 64-bit element, imm = 0, 1, ..., 31
+define void @llvm_aarch64_sve_gather_prfb_nx2vi64_runtime_offset(<vscale x 2 x i64> %bases, i64 %imm, <vscale x 2 x i1> %Pg) nounwind {
+; CHECK-LABEL: llvm_aarch64_sve_gather_prfb_nx2vi64_runtime_offset:
+; CHECK-NEXT: prfb pldl1strm, p0, [x0, z0.d, uxtw]
+; CHECK-NEXT: ret
+ call void @llvm.aarch64.sve.gather.prfb.nx2vi64(<vscale x 2 x i1> %Pg, <vscale x 2 x i64> %bases, i64 %imm, i32 1)
+ ret void
+}
+
+define void @llvm_aarch64_sve_gather_prfb_nx2vi64_invalid_immediate_offset_upper_bound(<vscale x 2 x i64> %bases, <vscale x 2 x i1> %Pg) nounwind {
+; CHECK-LABEL: llvm_aarch64_sve_gather_prfb_nx2vi64_invalid_immediate_offset_upper_bound:
+; CHECK-NEXT: mov w[[N:[0-9]+]], #32
+; CHECK-NEXT: prfb pldl1strm, p0, [x[[N]], z0.d, uxtw]
+; CHECK-NEXT: ret
+ call void @llvm.aarch64.sve.gather.prfb.nx2vi64(<vscale x 2 x i1> %Pg, <vscale x 2 x i64> %bases, i64 32, i32 1)
+ ret void
+}
+
+define void @llvm_aarch64_sve_gather_prfb_nx2vi64_invalid_immediate_offset_lower_bound(<vscale x 2 x i64> %bases, <vscale x 2 x i1> %Pg) nounwind {
+; CHECK-LABEL: llvm_aarch64_sve_gather_prfb_nx2vi64_invalid_immediate_offset_lower_bound:
+; CHECK-NEXT: mov x[[N:[0-9]+]], #-1
+; CHECK-NEXT: prfb pldl1strm, p0, [x[[N:[0-9]+]], z0.d, uxtw]
+; CHECK-NEXT: ret
+ call void @llvm.aarch64.sve.gather.prfb.nx2vi64(<vscale x 2 x i1> %Pg, <vscale x 2 x i64> %bases, i64 -1, i32 1)
+ ret void
+}
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+; PRFH <prfop>, <Pg>, [<Zn>.S{, #<imm>}] -> 32-bit element, imm = 0, 2, ..., 62
+define void @llvm_aarch64_sve_gather_prfh_nx4vi32_runtime_offset(<vscale x 4 x i32> %bases, i64 %imm, <vscale x 4 x i1> %Pg) nounwind {
+; CHECK-LABEL: llvm_aarch64_sve_gather_prfh_nx4vi32_runtime_offset:
+; CHECK-NEXT: prfh pldl1strm, p0, [x0, z0.s, uxtw #1]
+; CHECK-NEXT: ret
+ call void @llvm.aarch64.sve.gather.prfh.nx4vi32(<vscale x 4 x i1> %Pg, <vscale x 4 x i32> %bases, i64 %imm, i32 1)
+ ret void
+}
+
+define void @llvm_aarch64_sve_gather_prfh_nx4vi32_invalid_immediate_offset_upper_bound(<vscale x 4 x i32> %bases, <vscale x 4 x i1> %Pg) nounwind {
+; CHECK-LABEL: llvm_aarch64_sve_gather_prfh_nx4vi32_invalid_immediate_offset_upper_bound:
+; CHECK-NEXT: mov w[[N:[0-9]+]], #63
+; CHECK-NEXT: prfh pldl1strm, p0, [x[[N]], z0.s, uxtw #1]
+; CHECK-NEXT: ret
+ call void @llvm.aarch64.sve.gather.prfh.nx4vi32(<vscale x 4 x i1> %Pg, <vscale x 4 x i32> %bases, i64 63, i32 1)
+ ret void
+}
+
+define void @llvm_aarch64_sve_gather_prfh_nx4vi32_invalid_immediate_offset_lower_bound(<vscale x 4 x i32> %bases, <vscale x 4 x i1> %Pg) nounwind {
+; CHECK-LABEL: llvm_aarch64_sve_gather_prfh_nx4vi32_invalid_immediate_offset_lower_bound:
+; CHECK-NEXT: mov x[[N:[0-9]+]], #-1
+; CHECK-NEXT: prfh pldl1strm, p0, [x[[N:[0-9]+]], z0.s, uxtw #1]
+; CHECK-NEXT: ret
+ call void @llvm.aarch64.sve.gather.prfh.nx4vi32(<vscale x 4 x i1> %Pg, <vscale x 4 x i32> %bases, i64 -1, i32 1)
+ ret void
+}
+
+define void @llvm_aarch64_sve_gather_prfh_nx4vi32_invalid_immediate_offset_inbound_not_multiple_of_2(<vscale x 4 x i32> %bases, <vscale x 4 x i1> %Pg) nounwind {
+; CHECK-LABEL: llvm_aarch64_sve_gather_prfh_nx4vi32_invalid_immediate_offset_inbound_not_multiple_of_2:
+; CHECK-NEXT: mov w[[N:[0-9]+]], #33
+; CHECK-NEXT: prfh pldl1strm, p0, [x[[N:[0-9]+]], z0.s, uxtw #1]
+; CHECK-NEXT: ret
+ call void @llvm.aarch64.sve.gather.prfh.nx4vi32(<vscale x 4 x i1> %Pg, <vscale x 4 x i32> %bases, i64 33, i32 1)
+ ret void
+}
+
+; PRFH <prfop>, <Pg>, [<Zn>.D{, #<imm>}] -> 64-bit element, imm = 0, 2, ..., 62
+define void @llvm_aarch64_sve_gather_prfh_nx2vi64_runtime_offset(<vscale x 2 x i64> %bases, i64 %imm, <vscale x 2 x i1> %Pg) nounwind {
+; CHECK-LABEL: llvm_aarch64_sve_gather_prfh_nx2vi64_runtime_offset:
+; CHECK-NEXT: prfh pldl1strm, p0, [x0, z0.d, uxtw #1]
+; CHECK-NEXT: ret
+ call void @llvm.aarch64.sve.gather.prfh.nx2vi64(<vscale x 2 x i1> %Pg, <vscale x 2 x i64> %bases, i64 %imm, i32 1)
+ ret void
+}
+
+define void @llvm_aarch64_sve_gather_prfh_nx2vi64_invalid_immediate_offset_upper_bound(<vscale x 2 x i64> %bases, <vscale x 2 x i1> %Pg) nounwind {
+; CHECK-LABEL: llvm_aarch64_sve_gather_prfh_nx2vi64_invalid_immediate_offset_upper_bound:
+; CHECK-NEXT: mov w[[N:[0-9]+]], #63
+; CHECK-NEXT: prfh pldl1strm, p0, [x[[N]], z0.d, uxtw #1]
+; CHECK-NEXT: ret
+ call void @llvm.aarch64.sve.gather.prfh.nx2vi64(<vscale x 2 x i1> %Pg, <vscale x 2 x i64> %bases, i64 63, i32 1)
+ ret void
+}
+
+define void @llvm_aarch64_sve_gather_prfh_nx2vi64_invalid_immediate_offset_lower_bound(<vscale x 2 x i64> %bases, <vscale x 2 x i1> %Pg) nounwind {
+; CHECK-LABEL: llvm_aarch64_sve_gather_prfh_nx2vi64_invalid_immediate_offset_lower_bound:
+; CHECK-NEXT: mov x[[N:[0-9]+]], #-1
+; CHECK-NEXT: prfh pldl1strm, p0, [x[[N:[0-9]+]], z0.d, uxtw #1]
+; CHECK-NEXT: ret
+ call void @llvm.aarch64.sve.gather.prfh.nx2vi64(<vscale x 2 x i1> %Pg, <vscale x 2 x i64> %bases, i64 -1, i32 1)
+ ret void
+}
+
+define void @llvm_aarch64_sve_gather_prfh_nx2vi64_invalid_immediate_offset_inbound_not_multiple_of_2(<vscale x 2 x i64> %bases, <vscale x 2 x i1> %Pg) nounwind {
+; CHECK-LABEL: llvm_aarch64_sve_gather_prfh_nx2vi64_invalid_immediate_offset_inbound_not_multiple_of_2:
+; CHECK-NEXT: mov w[[N:[0-9]+]], #33
+; CHECK-NEXT: prfh pldl1strm, p0, [x[[N:[0-9]+]], z0.d, uxtw #1]
+; CHECK-NEXT: ret
+ call void @llvm.aarch64.sve.gather.prfh.nx2vi64(<vscale x 2 x i1> %Pg, <vscale x 2 x i64> %bases, i64 33, i32 1)
+ ret void
+}
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+; PRFW <prfop>, <Pg>, [<Zn>.S{, #<imm>}] -> 32-bit element, imm = 0, 4, ..., 124
+define void @llvm_aarch64_sve_gather_prfw_nx4vi32_runtime_offset(<vscale x 4 x i32> %bases, i64 %imm, <vscale x 4 x i1> %Pg) nounwind {
+; CHECK-LABEL: llvm_aarch64_sve_gather_prfw_nx4vi32_runtime_offset:
+; CHECK-NEXT: prfw pldl1strm, p0, [x0, z0.s, uxtw #2]
+; CHECK-NEXT: ret
+ call void @llvm.aarch64.sve.gather.prfw.nx4vi32(<vscale x 4 x i1> %Pg, <vscale x 4 x i32> %bases, i64 %imm, i32 1)
+ ret void
+}
+
+define void @llvm_aarch64_sve_gather_prfw_nx4vi32_invalid_immediate_offset_upper_bound(<vscale x 4 x i32> %bases, <vscale x 4 x i1> %Pg) nounwind {
+; CHECK-LABEL: llvm_aarch64_sve_gather_prfw_nx4vi32_invalid_immediate_offset_upper_bound:
+; CHECK-NEXT: mov w[[N:[0-9]+]], #125
+; CHECK-NEXT: prfw pldl1strm, p0, [x[[N]], z0.s, uxtw #2]
+; CHECK-NEXT: ret
+ call void @llvm.aarch64.sve.gather.prfw.nx4vi32(<vscale x 4 x i1> %Pg, <vscale x 4 x i32> %bases, i64 125, i32 1)
+ ret void
+}
+
+define void @llvm_aarch64_sve_gather_prfw_nx4vi32_invalid_immediate_offset_lower_bound(<vscale x 4 x i32> %bases, <vscale x 4 x i1> %Pg) nounwind {
+; CHECK-LABEL: llvm_aarch64_sve_gather_prfw_nx4vi32_invalid_immediate_offset_lower_bound:
+; CHECK-NEXT: mov x[[N:[0-9]+]], #-1
+; CHECK-NEXT: prfw pldl1strm, p0, [x[[N:[0-9]+]], z0.s, uxtw #2]
+; CHECK-NEXT: ret
+ call void @llvm.aarch64.sve.gather.prfw.nx4vi32(<vscale x 4 x i1> %Pg, <vscale x 4 x i32> %bases, i64 -1, i32 1)
+ ret void
+}
+
+define void @llvm_aarch64_sve_gather_prfw_nx4vi32_invalid_immediate_offset_inbound_not_multiple_of_4(<vscale x 4 x i32> %bases, <vscale x 4 x i1> %Pg) nounwind {
+; CHECK-LABEL: llvm_aarch64_sve_gather_prfw_nx4vi32_invalid_immediate_offset_inbound_not_multiple_of_4:
+; CHECK-NEXT: mov w[[N:[0-9]+]], #33
+; CHECK-NEXT: prfw pldl1strm, p0, [x[[N:[0-9]+]], z0.s, uxtw #2]
+; CHECK-NEXT: ret
+ call void @llvm.aarch64.sve.gather.prfw.nx4vi32(<vscale x 4 x i1> %Pg, <vscale x 4 x i32> %bases, i64 33, i32 1)
+ ret void
+}
+
+; PRFW <prfop>, <Pg>, [<Zn>.D{, #<imm>}] -> 64-bit element, imm = 0, 4, ..., 124
+define void @llvm_aarch64_sve_gather_prfw_nx2vi64_runtime_offset(<vscale x 2 x i64> %bases, i64 %imm, <vscale x 2 x i1> %Pg) nounwind {
+; CHECK-LABEL: llvm_aarch64_sve_gather_prfw_nx2vi64_runtime_offset:
+; CHECK-NEXT: prfw pldl1strm, p0, [x0, z0.d, uxtw #2]
+; CHECK-NEXT: ret
+ call void @llvm.aarch64.sve.gather.prfw.nx2vi64(<vscale x 2 x i1> %Pg, <vscale x 2 x i64> %bases, i64 %imm, i32 1)
+ ret void
+}
+
+define void @llvm_aarch64_sve_gather_prfw_nx2vi64_invalid_immediate_offset_upper_bound(<vscale x 2 x i64> %bases, <vscale x 2 x i1> %Pg) nounwind {
+; CHECK-LABEL: llvm_aarch64_sve_gather_prfw_nx2vi64_invalid_immediate_offset_upper_bound:
+; CHECK-NEXT: mov w[[N:[0-9]+]], #125
+; CHECK-NEXT: prfw pldl1strm, p0, [x[[N]], z0.d, uxtw #2]
+; CHECK-NEXT: ret
+ call void @llvm.aarch64.sve.gather.prfw.nx2vi64(<vscale x 2 x i1> %Pg, <vscale x 2 x i64> %bases, i64 125, i32 1)
+ ret void
+}
+
+define void @llvm_aarch64_sve_gather_prfw_nx2vi64_invalid_immediate_offset_lower_bound(<vscale x 2 x i64> %bases, <vscale x 2 x i1> %Pg) nounwind {
+; CHECK-LABEL: llvm_aarch64_sve_gather_prfw_nx2vi64_invalid_immediate_offset_lower_bound:
+; CHECK-NEXT: mov x[[N:[0-9]+]], #-1
+; CHECK-NEXT: prfw pldl1strm, p0, [x[[N:[0-9]+]], z0.d, uxtw #2]
+; CHECK-NEXT: ret
+ call void @llvm.aarch64.sve.gather.prfw.nx2vi64(<vscale x 2 x i1> %Pg, <vscale x 2 x i64> %bases, i64 -1, i32 1)
+ ret void
+}
+
+define void @llvm_aarch64_sve_gather_prfw_nx2vi64_invalid_immediate_offset_inbound_not_multiple_of_4(<vscale x 2 x i64> %bases, <vscale x 2 x i1> %Pg) nounwind {
+; CHECK-LABEL: llvm_aarch64_sve_gather_prfw_nx2vi64_invalid_immediate_offset_inbound_not_multiple_of_4:
+; CHECK-NEXT: mov w[[N:[0-9]+]], #33
+; CHECK-NEXT: prfw pldl1strm, p0, [x[[N:[0-9]+]], z0.d, uxtw #2]
+; CHECK-NEXT: ret
+ call void @llvm.aarch64.sve.gather.prfw.nx2vi64(<vscale x 2 x i1> %Pg, <vscale x 2 x i64> %bases, i64 33, i32 1)
+ ret void
+}
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+; PRFD <prfop>, <Pg>, [<Zn>.S{, #<imm>}] -> 32-bit element, imm = 0, 8, ..., 248
+define void @llvm_aarch64_sve_gather_prfd_nx4vi32_runtime_offset(<vscale x 4 x i32> %bases, i64 %imm, <vscale x 4 x i1> %Pg) nounwind {
+; CHECK-LABEL: llvm_aarch64_sve_gather_prfd_nx4vi32_runtime_offset:
+; CHECK-NEXT: prfd pldl1strm, p0, [x0, z0.s, uxtw #3]
+; CHECK-NEXT: ret
+ call void @llvm.aarch64.sve.gather.prfd.nx4vi32(<vscale x 4 x i1> %Pg, <vscale x 4 x i32> %bases, i64 %imm, i32 1)
+ ret void
+}
+
+define void @llvm_aarch64_sve_gather_prfd_nx4vi32_invalid_immediate_offset_upper_bound(<vscale x 4 x i32> %bases, <vscale x 4 x i1> %Pg) nounwind {
+; CHECK-LABEL: llvm_aarch64_sve_gather_prfd_nx4vi32_invalid_immediate_offset_upper_bound:
+; CHECK-NEXT: mov w[[N:[0-9]+]], #125
+; CHECK-NEXT: prfd pldl1strm, p0, [x[[N]], z0.s, uxtw #3]
+; CHECK-NEXT: ret
+ call void @llvm.aarch64.sve.gather.prfd.nx4vi32(<vscale x 4 x i1> %Pg, <vscale x 4 x i32> %bases, i64 125, i32 1)
+ ret void
+}
+
+define void @llvm_aarch64_sve_gather_prfd_nx4vi32_invalid_immediate_offset_lower_bound(<vscale x 4 x i32> %bases, <vscale x 4 x i1> %Pg) nounwind {
+; CHECK-LABEL: llvm_aarch64_sve_gather_prfd_nx4vi32_invalid_immediate_offset_lower_bound:
+; CHECK-NEXT: mov x[[N:[0-9]+]], #-1
+; CHECK-NEXT: prfd pldl1strm, p0, [x[[N:[0-9]+]], z0.s, uxtw #3]
+; CHECK-NEXT: ret
+ call void @llvm.aarch64.sve.gather.prfd.nx4vi32(<vscale x 4 x i1> %Pg, <vscale x 4 x i32> %bases, i64 -1, i32 1)
+ ret void
+}
+
+define void @llvm_aarch64_sve_gather_prfd_nx4vi32_invalid_immediate_offset_inbound_not_multiple_of_8(<vscale x 4 x i32> %bases, <vscale x 4 x i1> %Pg) nounwind {
+; CHECK-LABEL: llvm_aarch64_sve_gather_prfd_nx4vi32_invalid_immediate_offset_inbound_not_multiple_of_8:
+; CHECK-NEXT: mov w[[N:[0-9]+]], #33
+; CHECK-NEXT: prfd pldl1strm, p0, [x[[N:[0-9]+]], z0.s, uxtw #3]
+; CHECK-NEXT: ret
+ call void @llvm.aarch64.sve.gather.prfd.nx4vi32(<vscale x 4 x i1> %Pg, <vscale x 4 x i32> %bases, i64 33, i32 1)
+ ret void
+}
+
+; PRFD <prfop>, <Pg>, [<Zn>.D{, #<imm>}] -> 64-bit element, imm = 0, 8, ..., 248
+define void @llvm_aarch64_sve_gather_prfd_nx2vi64_runtime_offset(<vscale x 2 x i64> %bases, i64 %imm, <vscale x 2 x i1> %Pg) nounwind {
+; CHECK-LABEL: llvm_aarch64_sve_gather_prfd_nx2vi64_runtime_offset:
+; CHECK-NEXT: prfd pldl1strm, p0, [x0, z0.d, uxtw #3]
+; CHECK-NEXT: ret
+ call void @llvm.aarch64.sve.gather.prfd.nx2vi64(<vscale x 2 x i1> %Pg, <vscale x 2 x i64> %bases, i64 %imm, i32 1)
+ ret void
+}
+
+define void @llvm_aarch64_sve_gather_prfd_nx2vi64_invalid_immediate_offset_upper_bound(<vscale x 2 x i64> %bases, <vscale x 2 x i1> %Pg) nounwind {
+; CHECK-LABEL: llvm_aarch64_sve_gather_prfd_nx2vi64_invalid_immediate_offset_upper_bound:
+; CHECK-NEXT: mov w[[N:[0-9]+]], #125
+; CHECK-NEXT: prfd pldl1strm, p0, [x[[N]], z0.d, uxtw #3]
+; CHECK-NEXT: ret
+ call void @llvm.aarch64.sve.gather.prfd.nx2vi64(<vscale x 2 x i1> %Pg, <vscale x 2 x i64> %bases, i64 125, i32 1)
+ ret void
+}
+
+define void @llvm_aarch64_sve_gather_prfd_nx2vi64_invalid_immediate_offset_lower_bound(<vscale x 2 x i64> %bases, <vscale x 2 x i1> %Pg) nounwind {
+; CHECK-LABEL: llvm_aarch64_sve_gather_prfd_nx2vi64_invalid_immediate_offset_lower_bound:
+; CHECK-NEXT: mov x[[N:[0-9]+]], #-1
+; CHECK-NEXT: prfd pldl1strm, p0, [x[[N:[0-9]+]], z0.d, uxtw #3]
+; CHECK-NEXT: ret
+ call void @llvm.aarch64.sve.gather.prfd.nx2vi64(<vscale x 2 x i1> %Pg, <vscale x 2 x i64> %bases, i64 -1, i32 1)
+ ret void
+}
+
+define void @llvm_aarch64_sve_gather_prfd_nx2vi64_invalid_immediate_offset_inbound_not_multiple_of_8(<vscale x 2 x i64> %bases, <vscale x 2 x i1> %Pg) nounwind {
+; CHECK-LABEL: llvm_aarch64_sve_gather_prfd_nx2vi64_invalid_immediate_offset_inbound_not_multiple_of_8:
+; CHECK-NEXT: mov w[[N:[0-9]+]], #33
+; CHECK-NEXT: prfd pldl1strm, p0, [x[[N:[0-9]+]], z0.d, uxtw #3]
+; CHECK-NEXT: ret
+ call void @llvm.aarch64.sve.gather.prfd.nx2vi64(<vscale x 2 x i1> %Pg, <vscale x 2 x i64> %bases, i64 33, i32 1)
+ ret void
+}
+
+declare void @llvm.aarch64.sve.gather.prfb.nx4vi32(<vscale x 4 x i1> %Pg, <vscale x 4 x i32> %bases, i64 %imm, i32 %prfop)
+declare void @llvm.aarch64.sve.gather.prfb.nx2vi64(<vscale x 2 x i1> %Pg, <vscale x 2 x i64> %bases, i64 %imm, i32 %prfop)
+declare void @llvm.aarch64.sve.gather.prfh.nx4vi32(<vscale x 4 x i1> %Pg, <vscale x 4 x i32> %bases, i64 %imm, i32 %prfop)
+declare void @llvm.aarch64.sve.gather.prfh.nx2vi64(<vscale x 2 x i1> %Pg, <vscale x 2 x i64> %bases, i64 %imm, i32 %prfop)
+declare void @llvm.aarch64.sve.gather.prfw.nx4vi32(<vscale x 4 x i1> %Pg, <vscale x 4 x i32> %bases, i64 %imm, i32 %prfop)
+declare void @llvm.aarch64.sve.gather.prfw.nx2vi64(<vscale x 2 x i1> %Pg, <vscale x 2 x i64> %bases, i64 %imm, i32 %prfop)
+declare void @llvm.aarch64.sve.gather.prfd.nx4vi32(<vscale x 4 x i1> %Pg, <vscale x 4 x i32> %bases, i64 %imm, i32 %prfop)
+declare void @llvm.aarch64.sve.gather.prfd.nx2vi64(<vscale x 2 x i1> %Pg, <vscale x 2 x i64> %bases, i64 %imm, i32 %prfop)