[llvm] fa9439f - [AArch64][SVE] Add intrinsics for first-faulting gather loads
Andrzej Warzynski via llvm-commits
llvm-commits at lists.llvm.org
Thu Feb 27 04:56:52 PST 2020
Author: Andrzej Warzynski
Date: 2020-02-27T12:56:33Z
New Revision: fa9439fac84ea4eb4050aa1ae150c0ec2cf86c20
URL: https://github.com/llvm/llvm-project/commit/fa9439fac84ea4eb4050aa1ae150c0ec2cf86c20
DIFF: https://github.com/llvm/llvm-project/commit/fa9439fac84ea4eb4050aa1ae150c0ec2cf86c20.diff
LOG: [AArch64][SVE] Add intrinsics for first-faulting gather loads
Summary:
The following intrinsics are added:
* @llvm.aarch64.sve.ldff1.gather
* @llvm.aarch64.sve.ldff1.gather.index
* @llvm.aarch64.sve.ldff1.gather.sxtw
* @llvm.aarch64.sve.ldff1.gather.uxtw
* @llvm.aarch64.sve.ldff1.gather.sxtw.index
* @llvm.aarch64.sve.ldff1.gather.uxtw.index
* @llvm.aarch64.sve.ldff1.gather.scalar.offset
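For reference, a minimal IR sketch of how one of the unsigned forms is used. It mirrors the gldff1h_s_uxtw_index test added below; the function name @example is purely illustrative:

  define <vscale x 4 x i32> @example(<vscale x 4 x i1> %pg, i16* %base, <vscale x 4 x i32> %offsets) {
    ; First-faulting gather of 16-bit elements; the 32-bit offsets are
    ; zero-extended (uxtw) and scaled by the element size, so this selects:
    ;   ldff1h { z0.s }, p0/z, [x0, z0.s, uxtw #1]
    %load = call <vscale x 4 x i16> @llvm.aarch64.sve.ldff1.gather.uxtw.index.nxv4i16(<vscale x 4 x i1> %pg, i16* %base, <vscale x 4 x i32> %offsets)
    %res = zext <vscale x 4 x i16> %load to <vscale x 4 x i32>
    ret <vscale x 4 x i32> %res
  }
  declare <vscale x 4 x i16> @llvm.aarch64.sve.ldff1.gather.uxtw.index.nxv4i16(<vscale x 4 x i1>, i16*, <vscale x 4 x i32>)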
Although this patch is quite substantial, the vast majority of it is a
'copy & paste' of the existing implementation of regular gather loads,
including tests. There is only a handful of new definitions:
* AArch64ISD nodes defined in AArch64ISelLowering.h (e.g. GLDFF1)
* SelectionDAG node definitions in AArch64SVEInstrInfo.td (e.g.
AArch64ldff1_gather)
* Intrinsics in IntrinsicsAArch64.td (e.g. aarch64_sve_ldff1_gather)
* Pseudo instructions in SVEInstrFormats.td to work around use-before-def
of the FFR register.
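As with the regular gathers, a sign-extend of the loaded result is folded into the corresponding signed node (GLDFF1S*), so the LDFF1S* instructions get selected. A sketch of that case, modelled on the gldff1sh_s_uxtw_index test added below (it reuses the intrinsic declaration from the sketch above; @example_signed is again an illustrative name):

  define <vscale x 4 x i32> @example_signed(<vscale x 4 x i1> %pg, i16* %base, <vscale x 4 x i32> %offsets) {
    ; The sext is combined into GLDFF1S_UXTW_SCALED and selects:
    ;   ldff1sh { z0.s }, p0/z, [x0, z0.s, uxtw #1]
    %load = call <vscale x 4 x i16> @llvm.aarch64.sve.ldff1.gather.uxtw.index.nxv4i16(<vscale x 4 x i1> %pg, i16* %base, <vscale x 4 x i32> %offsets)
    %res = sext <vscale x 4 x i16> %load to <vscale x 4 x i32>
    ret <vscale x 4 x i32> %res
  }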
Reviewed By: sdesmalen
Differential Revision: https://reviews.llvm.org/D75128
Added:
llvm/test/CodeGen/AArch64/sve-intrinsics-ff-gather-loads-32bit-scaled-offsets.ll
llvm/test/CodeGen/AArch64/sve-intrinsics-ff-gather-loads-32bit-unscaled-offsets.ll
llvm/test/CodeGen/AArch64/sve-intrinsics-ff-gather-loads-64bit-scaled-offset.ll
llvm/test/CodeGen/AArch64/sve-intrinsics-ff-gather-loads-64bit-unscaled-offset.ll
llvm/test/CodeGen/AArch64/sve-intrinsics-ff-gather-loads-vector-base-imm-offset.ll
llvm/test/CodeGen/AArch64/sve-intrinsics-ff-gather-loads-vector-base-scalar-offset.ll
Modified:
llvm/include/llvm/IR/IntrinsicsAArch64.td
llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
llvm/lib/Target/AArch64/AArch64ISelLowering.h
llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td
llvm/lib/Target/AArch64/SVEInstrFormats.td
Removed:
################################################################################
diff --git a/llvm/include/llvm/IR/IntrinsicsAArch64.td b/llvm/include/llvm/IR/IntrinsicsAArch64.td
index 730241b45e30..ac5d5306cb47 100644
--- a/llvm/include/llvm/IR/IntrinsicsAArch64.td
+++ b/llvm/include/llvm/IR/IntrinsicsAArch64.td
@@ -1734,6 +1734,32 @@ def int_aarch64_sve_ld1_gather_uxtw_index : AdvSIMD_GatherLoad_SV_32b_Offsets_In
def int_aarch64_sve_ld1_gather_scalar_offset : AdvSIMD_GatherLoad_VS_Intrinsic;
+
+//
+// First-faulting gather loads: scalar base + vector offsets
+//
+
+// 64 bit unscaled offsets
+def int_aarch64_sve_ldff1_gather : AdvSIMD_GatherLoad_SV_64b_Offsets_Intrinsic;
+
+// 64 bit scaled offsets
+def int_aarch64_sve_ldff1_gather_index : AdvSIMD_GatherLoad_SV_64b_Offsets_Intrinsic;
+
+// 32 bit unscaled offsets, sign (sxtw) or zero (uxtw) extended to 64 bits
+def int_aarch64_sve_ldff1_gather_sxtw : AdvSIMD_GatherLoad_SV_32b_Offsets_Intrinsic;
+def int_aarch64_sve_ldff1_gather_uxtw : AdvSIMD_GatherLoad_SV_32b_Offsets_Intrinsic;
+
+// 32 bit scaled offsets, sign (sxtw) or zero (uxtw) extended to 64 bits
+def int_aarch64_sve_ldff1_gather_sxtw_index : AdvSIMD_GatherLoad_SV_32b_Offsets_Intrinsic;
+def int_aarch64_sve_ldff1_gather_uxtw_index : AdvSIMD_GatherLoad_SV_32b_Offsets_Intrinsic;
+
+//
+// First-faulting gather loads: vector base + scalar offset
+//
+
+def int_aarch64_sve_ldff1_gather_scalar_offset : AdvSIMD_GatherLoad_VS_Intrinsic;
+
+
//
// Scatter stores: scalar base + vector offsets
//
diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
index a3305db99558..2346ce5a0965 100644
--- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
+++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
@@ -1422,6 +1422,22 @@ const char *AArch64TargetLowering::getTargetNodeName(unsigned Opcode) const {
case AArch64ISD::GLD1S_SXTW_SCALED: return "AArch64ISD::GLD1S_SXTW_SCALED";
case AArch64ISD::GLD1S_UXTW_SCALED: return "AArch64ISD::GLD1S_UXTW_SCALED";
case AArch64ISD::GLD1S_IMM: return "AArch64ISD::GLD1S_IMM";
+ case AArch64ISD::GLDFF1: return "AArch64ISD::GLDFF1";
+ case AArch64ISD::GLDFF1_SCALED: return "AArch64ISD::GLDFF1_SCALED";
+ case AArch64ISD::GLDFF1_SXTW: return "AArch64ISD::GLDFF1_SXTW";
+ case AArch64ISD::GLDFF1_UXTW: return "AArch64ISD::GLDFF1_UXTW";
+ case AArch64ISD::GLDFF1_SXTW_SCALED:return "AArch64ISD::GLDFF1_SXTW_SCALED";
+ case AArch64ISD::GLDFF1_UXTW_SCALED:return "AArch64ISD::GLDFF1_UXTW_SCALED";
+ case AArch64ISD::GLDFF1_IMM: return "AArch64ISD::GLDFF1_IMM";
+ case AArch64ISD::GLDFF1S: return "AArch64ISD::GLDFF1S";
+ case AArch64ISD::GLDFF1S_SCALED: return "AArch64ISD::GLDFF1S_SCALED";
+ case AArch64ISD::GLDFF1S_SXTW: return "AArch64ISD::GLDFF1S_SXTW";
+ case AArch64ISD::GLDFF1S_UXTW: return "AArch64ISD::GLDFF1S_UXTW";
+ case AArch64ISD::GLDFF1S_SXTW_SCALED:
+ return "AArch64ISD::GLDFF1S_SXTW_SCALED";
+ case AArch64ISD::GLDFF1S_UXTW_SCALED:
+ return "AArch64ISD::GLDFF1S_UXTW_SCALED";
+ case AArch64ISD::GLDFF1S_IMM: return "AArch64ISD::GLDFF1S_IMM";
case AArch64ISD::SST1: return "AArch64ISD::SST1";
case AArch64ISD::SST1_SCALED: return "AArch64ISD::SST1_SCALED";
case AArch64ISD::SST1_SXTW: return "AArch64ISD::SST1_SXTW";
@@ -10434,6 +10450,13 @@ static SDValue performSVEAndCombine(SDNode *N,
case AArch64ISD::GLD1_UXTW:
case AArch64ISD::GLD1_UXTW_SCALED:
case AArch64ISD::GLD1_IMM:
+ case AArch64ISD::GLDFF1:
+ case AArch64ISD::GLDFF1_SCALED:
+ case AArch64ISD::GLDFF1_SXTW:
+ case AArch64ISD::GLDFF1_SXTW_SCALED:
+ case AArch64ISD::GLDFF1_UXTW:
+ case AArch64ISD::GLDFF1_UXTW_SCALED:
+ case AArch64ISD::GLDFF1_IMM:
MemVT = cast<VTSDNode>(Src->getOperand(4))->getVT();
break;
default:
@@ -12707,13 +12730,13 @@ static SDValue performGatherLoadCombine(SDNode *N, SelectionDAG &DAG,
// vector of offsets (that fits into one register)
SDValue Offset = N->getOperand(4);
- // GLD1_IMM requires that the offset is an immediate that is:
+ // GLD{FF}1_IMM requires that the offset is an immediate that is:
// * a multiple of #SizeInBytes,
// * in the range [0, 31 x #SizeInBytes],
// where #SizeInBytes is the size in bytes of the loaded items. For
// immediates outside that range and non-immediate scalar offsets use GLD1 or
// GLD1_UXTW instead.
- if (Opcode == AArch64ISD::GLD1_IMM) {
+ if (Opcode == AArch64ISD::GLD1_IMM || Opcode == AArch64ISD::GLDFF1_IMM) {
uint64_t MaxIndex = 31;
uint64_t RetElSize = RetElVT.getStoreSize().getKnownMinSize();
@@ -12722,9 +12745,11 @@ static SDValue performGatherLoadCombine(SDNode *N, SelectionDAG &DAG,
OffsetConst->getZExtValue() > MaxIndex * RetElSize ||
OffsetConst->getZExtValue() % RetElSize) {
if (MVT::nxv4i32 == Base.getValueType().getSimpleVT().SimpleTy)
- Opcode = AArch64ISD::GLD1_UXTW;
+ Opcode = (Opcode == AArch64ISD::GLD1_IMM) ? AArch64ISD::GLD1_UXTW
+ : AArch64ISD::GLDFF1_UXTW;
else
- Opcode = AArch64ISD::GLD1;
+ Opcode = (Opcode == AArch64ISD::GLD1_IMM) ? AArch64ISD::GLD1
+ : AArch64ISD::GLDFF1;
std::swap(Base, Offset);
}
@@ -12813,6 +12838,27 @@ performSignExtendInRegCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI,
case AArch64ISD::GLD1_IMM:
NewOpc = AArch64ISD::GLD1S_IMM;
break;
+ case AArch64ISD::GLDFF1:
+ NewOpc = AArch64ISD::GLDFF1S;
+ break;
+ case AArch64ISD::GLDFF1_SCALED:
+ NewOpc = AArch64ISD::GLDFF1S_SCALED;
+ break;
+ case AArch64ISD::GLDFF1_SXTW:
+ NewOpc = AArch64ISD::GLDFF1S_SXTW;
+ break;
+ case AArch64ISD::GLDFF1_SXTW_SCALED:
+ NewOpc = AArch64ISD::GLDFF1S_SXTW_SCALED;
+ break;
+ case AArch64ISD::GLDFF1_UXTW:
+ NewOpc = AArch64ISD::GLDFF1S_UXTW;
+ break;
+ case AArch64ISD::GLDFF1_UXTW_SCALED:
+ NewOpc = AArch64ISD::GLDFF1S_UXTW_SCALED;
+ break;
+ case AArch64ISD::GLDFF1_IMM:
+ NewOpc = AArch64ISD::GLDFF1S_IMM;
+ break;
default:
return SDValue();
}
@@ -12950,6 +12996,24 @@ SDValue AArch64TargetLowering::PerformDAGCombine(SDNode *N,
/*OnlyPackedOffsets=*/false);
case Intrinsic::aarch64_sve_ld1_gather_scalar_offset:
return performGatherLoadCombine(N, DAG, AArch64ISD::GLD1_IMM);
+ case Intrinsic::aarch64_sve_ldff1_gather:
+ return performGatherLoadCombine(N, DAG, AArch64ISD::GLDFF1);
+ case Intrinsic::aarch64_sve_ldff1_gather_index:
+ return performGatherLoadCombine(N, DAG, AArch64ISD::GLDFF1_SCALED);
+ case Intrinsic::aarch64_sve_ldff1_gather_sxtw:
+ return performGatherLoadCombine(N, DAG, AArch64ISD::GLDFF1_SXTW,
+ /*OnlyPackedOffsets=*/false);
+ case Intrinsic::aarch64_sve_ldff1_gather_uxtw:
+ return performGatherLoadCombine(N, DAG, AArch64ISD::GLDFF1_UXTW,
+ /*OnlyPackedOffsets=*/false);
+ case Intrinsic::aarch64_sve_ldff1_gather_sxtw_index:
+ return performGatherLoadCombine(N, DAG, AArch64ISD::GLDFF1_SXTW_SCALED,
+ /*OnlyPackedOffsets=*/false);
+ case Intrinsic::aarch64_sve_ldff1_gather_uxtw_index:
+ return performGatherLoadCombine(N, DAG, AArch64ISD::GLDFF1_UXTW_SCALED,
+ /*OnlyPackedOffsets=*/false);
+ case Intrinsic::aarch64_sve_ldff1_gather_scalar_offset:
+ return performGatherLoadCombine(N, DAG, AArch64ISD::GLDFF1_IMM);
case Intrinsic::aarch64_sve_st1_scatter:
return performScatterStoreCombine(N, DAG, AArch64ISD::SST1);
case Intrinsic::aarch64_sve_st1_scatter_index:
diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.h b/llvm/lib/Target/AArch64/AArch64ISelLowering.h
index fecc78f77d9c..c5914b1f90b0 100644
--- a/llvm/lib/Target/AArch64/AArch64ISelLowering.h
+++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.h
@@ -242,6 +242,25 @@ enum NodeType : unsigned {
GLD1S_UXTW_SCALED,
GLD1S_SXTW_SCALED,
GLD1S_IMM,
+
+ // First-faulting unsigned gather loads.
+ GLDFF1,
+ GLDFF1_SCALED,
+ GLDFF1_UXTW,
+ GLDFF1_SXTW,
+ GLDFF1_UXTW_SCALED,
+ GLDFF1_SXTW_SCALED,
+ GLDFF1_IMM,
+
+ // First-faulting signed gather loads.
+ GLDFF1S,
+ GLDFF1S_SCALED,
+ GLDFF1S_UXTW,
+ GLDFF1S_SXTW,
+ GLDFF1S_UXTW_SCALED,
+ GLDFF1S_SXTW_SCALED,
+ GLDFF1S_IMM,
+
// Scatter store
SST1,
SST1_SCALED,
diff --git a/llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td b/llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td
index 3a360d07bc56..3687b3ad717a 100644
--- a/llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td
+++ b/llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td
@@ -53,6 +53,22 @@ def AArch64ld1s_gather_uxtw_scaled : SDNode<"AArch64ISD::GLD1S_UXTW_SCALED", SDT
def AArch64ld1s_gather_sxtw_scaled : SDNode<"AArch64ISD::GLD1S_SXTW_SCALED", SDT_AArch64_GATHER_SV, [SDNPHasChain, SDNPMayLoad, SDNPOptInGlue]>;
def AArch64ld1s_gather_imm : SDNode<"AArch64ISD::GLD1S_IMM", SDT_AArch64_GATHER_VS, [SDNPHasChain, SDNPMayLoad, SDNPOptInGlue]>;
+def AArch64ldff1_gather : SDNode<"AArch64ISD::GLDFF1", SDT_AArch64_GATHER_SV, [SDNPHasChain, SDNPMayLoad, SDNPOptInGlue]>;
+def AArch64ldff1_gather_scaled : SDNode<"AArch64ISD::GLDFF1_SCALED", SDT_AArch64_GATHER_SV, [SDNPHasChain, SDNPMayLoad, SDNPOptInGlue]>;
+def AArch64ldff1_gather_uxtw : SDNode<"AArch64ISD::GLDFF1_UXTW", SDT_AArch64_GATHER_SV, [SDNPHasChain, SDNPMayLoad, SDNPOptInGlue]>;
+def AArch64ldff1_gather_sxtw : SDNode<"AArch64ISD::GLDFF1_SXTW", SDT_AArch64_GATHER_SV, [SDNPHasChain, SDNPMayLoad, SDNPOptInGlue]>;
+def AArch64ldff1_gather_uxtw_scaled : SDNode<"AArch64ISD::GLDFF1_UXTW_SCALED", SDT_AArch64_GATHER_SV, [SDNPHasChain, SDNPMayLoad, SDNPOptInGlue]>;
+def AArch64ldff1_gather_sxtw_scaled : SDNode<"AArch64ISD::GLDFF1_SXTW_SCALED", SDT_AArch64_GATHER_SV, [SDNPHasChain, SDNPMayLoad, SDNPOptInGlue]>;
+def AArch64ldff1_gather_imm : SDNode<"AArch64ISD::GLDFF1_IMM", SDT_AArch64_GATHER_VS, [SDNPHasChain, SDNPMayLoad, SDNPOptInGlue]>;
+
+def AArch64ldff1s_gather : SDNode<"AArch64ISD::GLDFF1S", SDT_AArch64_GATHER_SV, [SDNPHasChain, SDNPMayLoad, SDNPOptInGlue]>;
+def AArch64ldff1s_gather_scaled : SDNode<"AArch64ISD::GLDFF1S_SCALED", SDT_AArch64_GATHER_SV, [SDNPHasChain, SDNPMayLoad, SDNPOptInGlue]>;
+def AArch64ldff1s_gather_uxtw : SDNode<"AArch64ISD::GLDFF1S_UXTW", SDT_AArch64_GATHER_SV, [SDNPHasChain, SDNPMayLoad, SDNPOptInGlue]>;
+def AArch64ldff1s_gather_sxtw : SDNode<"AArch64ISD::GLDFF1S_SXTW", SDT_AArch64_GATHER_SV, [SDNPHasChain, SDNPMayLoad, SDNPOptInGlue]>;
+def AArch64ldff1s_gather_uxtw_scaled : SDNode<"AArch64ISD::GLDFF1S_UXTW_SCALED", SDT_AArch64_GATHER_SV, [SDNPHasChain, SDNPMayLoad, SDNPOptInGlue]>;
+def AArch64ldff1s_gather_sxtw_scaled : SDNode<"AArch64ISD::GLDFF1S_SXTW_SCALED", SDT_AArch64_GATHER_SV, [SDNPHasChain, SDNPMayLoad, SDNPOptInGlue]>;
+def AArch64ldff1s_gather_imm : SDNode<"AArch64ISD::GLDFF1S_IMM", SDT_AArch64_GATHER_VS, [SDNPHasChain, SDNPMayLoad, SDNPOptInGlue]>;
+
// Scatter stores - node definitions
//
def SDT_AArch64_SCATTER_SV : SDTypeProfile<0, 5, [
@@ -581,114 +597,114 @@ let Predicates = [HasSVE] in {
// Gathers using unscaled 32-bit offsets, e.g.
// ld1h z0.s, p0/z, [x0, z0.s, uxtw]
defm GLD1SB_S : sve_mem_32b_gld_vs_32_unscaled<0b0000, "ld1sb", AArch64ld1s_gather_sxtw, AArch64ld1s_gather_uxtw, ZPR32ExtSXTW8Only, ZPR32ExtUXTW8Only, nxv4i8>;
- defm GLDFF1SB_S : sve_mem_32b_gld_vs_32_unscaled<0b0001, "ldff1sb", null_frag, null_frag, ZPR32ExtSXTW8Only, ZPR32ExtUXTW8Only, nxv4i8>;
+ defm GLDFF1SB_S : sve_mem_32b_gld_vs_32_unscaled<0b0001, "ldff1sb", AArch64ldff1s_gather_sxtw, AArch64ldff1s_gather_uxtw, ZPR32ExtSXTW8Only, ZPR32ExtUXTW8Only, nxv4i8>;
defm GLD1B_S : sve_mem_32b_gld_vs_32_unscaled<0b0010, "ld1b", AArch64ld1_gather_sxtw, AArch64ld1_gather_uxtw, ZPR32ExtSXTW8Only, ZPR32ExtUXTW8Only, nxv4i8>;
- defm GLDFF1B_S : sve_mem_32b_gld_vs_32_unscaled<0b0011, "ldff1b", null_frag, null_frag, ZPR32ExtSXTW8Only, ZPR32ExtUXTW8Only, nxv4i8>;
- defm GLD1SH_S : sve_mem_32b_gld_vs_32_unscaled<0b0100, "ld1sh", AArch64ld1s_gather_sxtw, AArch64ld1s_gather_uxtw, ZPR32ExtSXTW8, ZPR32ExtUXTW8, nxv4i16>;
- defm GLDFF1SH_S : sve_mem_32b_gld_vs_32_unscaled<0b0101, "ldff1sh", null_frag, null_frag, ZPR32ExtSXTW8, ZPR32ExtUXTW8, nxv4i16>;
- defm GLD1H_S : sve_mem_32b_gld_vs_32_unscaled<0b0110, "ld1h", AArch64ld1_gather_sxtw, AArch64ld1_gather_uxtw, ZPR32ExtSXTW8, ZPR32ExtUXTW8, nxv4i16>;
- defm GLDFF1H_S : sve_mem_32b_gld_vs_32_unscaled<0b0111, "ldff1h", null_frag, null_frag, ZPR32ExtSXTW8, ZPR32ExtUXTW8, nxv4i16>;
- defm GLD1W : sve_mem_32b_gld_vs_32_unscaled<0b1010, "ld1w", AArch64ld1_gather_sxtw, AArch64ld1_gather_uxtw, ZPR32ExtSXTW8, ZPR32ExtUXTW8, nxv4i32>;
- defm GLDFF1W : sve_mem_32b_gld_vs_32_unscaled<0b1011, "ldff1w", null_frag, null_frag, ZPR32ExtSXTW8, ZPR32ExtUXTW8, nxv4i32>;
+ defm GLDFF1B_S : sve_mem_32b_gld_vs_32_unscaled<0b0011, "ldff1b", AArch64ldff1_gather_sxtw, AArch64ldff1_gather_uxtw, ZPR32ExtSXTW8Only, ZPR32ExtUXTW8Only, nxv4i8>;
+ defm GLD1SH_S : sve_mem_32b_gld_vs_32_unscaled<0b0100, "ld1sh", AArch64ld1s_gather_sxtw, AArch64ld1s_gather_uxtw, ZPR32ExtSXTW8, ZPR32ExtUXTW8, nxv4i16>;
+ defm GLDFF1SH_S : sve_mem_32b_gld_vs_32_unscaled<0b0101, "ldff1sh", AArch64ldff1s_gather_sxtw, AArch64ldff1s_gather_uxtw, ZPR32ExtSXTW8, ZPR32ExtUXTW8, nxv4i16>;
+ defm GLD1H_S : sve_mem_32b_gld_vs_32_unscaled<0b0110, "ld1h", AArch64ld1_gather_sxtw, AArch64ld1_gather_uxtw, ZPR32ExtSXTW8, ZPR32ExtUXTW8, nxv4i16>;
+ defm GLDFF1H_S : sve_mem_32b_gld_vs_32_unscaled<0b0111, "ldff1h", AArch64ldff1_gather_sxtw, AArch64ldff1_gather_uxtw, ZPR32ExtSXTW8, ZPR32ExtUXTW8, nxv4i16>;
+ defm GLD1W : sve_mem_32b_gld_vs_32_unscaled<0b1010, "ld1w", AArch64ld1_gather_sxtw, AArch64ld1_gather_uxtw, ZPR32ExtSXTW8, ZPR32ExtUXTW8, nxv4i32>;
+ defm GLDFF1W : sve_mem_32b_gld_vs_32_unscaled<0b1011, "ldff1w", AArch64ldff1_gather_sxtw, AArch64ldff1_gather_uxtw, ZPR32ExtSXTW8, ZPR32ExtUXTW8, nxv4i32>;
// Gathers using scaled 32-bit offsets, e.g.
// ld1h z0.s, p0/z, [x0, z0.s, uxtw #1]
- defm GLD1SH_S : sve_mem_32b_gld_sv_32_scaled<0b0100, "ld1sh", AArch64ld1s_gather_sxtw_scaled, AArch64ld1s_gather_uxtw_scaled, ZPR32ExtSXTW16, ZPR32ExtUXTW16, nxv4i16>;
- defm GLDFF1SH_S : sve_mem_32b_gld_sv_32_scaled<0b0101, "ldff1sh", null_frag, null_frag, ZPR32ExtSXTW16, ZPR32ExtUXTW16, nxv4i16>;
- defm GLD1H_S : sve_mem_32b_gld_sv_32_scaled<0b0110, "ld1h", AArch64ld1_gather_sxtw_scaled, AArch64ld1_gather_uxtw_scaled, ZPR32ExtSXTW16, ZPR32ExtUXTW16, nxv4i16>;
- defm GLDFF1H_S : sve_mem_32b_gld_sv_32_scaled<0b0111, "ldff1h", null_frag, null_frag, ZPR32ExtSXTW16, ZPR32ExtUXTW16, nxv4i16>;
- defm GLD1W : sve_mem_32b_gld_sv_32_scaled<0b1010, "ld1w", AArch64ld1_gather_sxtw_scaled, AArch64ld1_gather_uxtw_scaled, ZPR32ExtSXTW32, ZPR32ExtUXTW32, nxv4i32>;
- defm GLDFF1W : sve_mem_32b_gld_sv_32_scaled<0b1011, "ldff1w", null_frag, null_frag, ZPR32ExtSXTW32, ZPR32ExtUXTW32, nxv4i32>;
+ defm GLD1SH_S : sve_mem_32b_gld_sv_32_scaled<0b0100, "ld1sh", AArch64ld1s_gather_sxtw_scaled, AArch64ld1s_gather_uxtw_scaled, ZPR32ExtSXTW16, ZPR32ExtUXTW16, nxv4i16>;
+ defm GLDFF1SH_S : sve_mem_32b_gld_sv_32_scaled<0b0101, "ldff1sh", AArch64ldff1s_gather_sxtw_scaled, AArch64ldff1s_gather_uxtw_scaled, ZPR32ExtSXTW16, ZPR32ExtUXTW16, nxv4i16>;
+ defm GLD1H_S : sve_mem_32b_gld_sv_32_scaled<0b0110, "ld1h", AArch64ld1_gather_sxtw_scaled, AArch64ld1_gather_uxtw_scaled, ZPR32ExtSXTW16, ZPR32ExtUXTW16, nxv4i16>;
+ defm GLDFF1H_S : sve_mem_32b_gld_sv_32_scaled<0b0111, "ldff1h", AArch64ldff1_gather_sxtw_scaled, AArch64ldff1_gather_uxtw_scaled, ZPR32ExtSXTW16, ZPR32ExtUXTW16, nxv4i16>;
+ defm GLD1W : sve_mem_32b_gld_sv_32_scaled<0b1010, "ld1w", AArch64ld1_gather_sxtw_scaled, AArch64ld1_gather_uxtw_scaled, ZPR32ExtSXTW32, ZPR32ExtUXTW32, nxv4i32>;
+ defm GLDFF1W : sve_mem_32b_gld_sv_32_scaled<0b1011, "ldff1w", AArch64ldff1_gather_sxtw_scaled, AArch64ldff1_gather_uxtw_scaled, ZPR32ExtSXTW32, ZPR32ExtUXTW32, nxv4i32>;
// Gathers using 32-bit pointers with scaled offset, e.g.
// ld1h z0.s, p0/z, [z0.s, #16]
- defm GLD1SB_S : sve_mem_32b_gld_vi_32_ptrs<0b0000, "ld1sb", imm0_31, AArch64ld1s_gather_imm, nxv4i8>;
- defm GLDFF1SB_S : sve_mem_32b_gld_vi_32_ptrs<0b0001, "ldff1sb", imm0_31, null_frag, nxv4i8>;
- defm GLD1B_S : sve_mem_32b_gld_vi_32_ptrs<0b0010, "ld1b", imm0_31, AArch64ld1_gather_imm, nxv4i8>;
- defm GLDFF1B_S : sve_mem_32b_gld_vi_32_ptrs<0b0011, "ldff1b", imm0_31, null_frag, nxv4i8>;
- defm GLD1SH_S : sve_mem_32b_gld_vi_32_ptrs<0b0100, "ld1sh", uimm5s2, AArch64ld1s_gather_imm, nxv4i16>;
- defm GLDFF1SH_S : sve_mem_32b_gld_vi_32_ptrs<0b0101, "ldff1sh", uimm5s2, null_frag, nxv4i16>;
- defm GLD1H_S : sve_mem_32b_gld_vi_32_ptrs<0b0110, "ld1h", uimm5s2, AArch64ld1_gather_imm, nxv4i16>;
- defm GLDFF1H_S : sve_mem_32b_gld_vi_32_ptrs<0b0111, "ldff1h", uimm5s2, null_frag, nxv4i16>;
- defm GLD1W : sve_mem_32b_gld_vi_32_ptrs<0b1010, "ld1w", uimm5s4, AArch64ld1_gather_imm, nxv4i32>;
- defm GLDFF1W : sve_mem_32b_gld_vi_32_ptrs<0b1011, "ldff1w", uimm5s4, null_frag, nxv4i32>;
+ defm GLD1SB_S : sve_mem_32b_gld_vi_32_ptrs<0b0000, "ld1sb", imm0_31, AArch64ld1s_gather_imm, nxv4i8>;
+ defm GLDFF1SB_S : sve_mem_32b_gld_vi_32_ptrs<0b0001, "ldff1sb", imm0_31, AArch64ldff1s_gather_imm, nxv4i8>;
+ defm GLD1B_S : sve_mem_32b_gld_vi_32_ptrs<0b0010, "ld1b", imm0_31, AArch64ld1_gather_imm, nxv4i8>;
+ defm GLDFF1B_S : sve_mem_32b_gld_vi_32_ptrs<0b0011, "ldff1b", imm0_31, AArch64ldff1_gather_imm, nxv4i8>;
+ defm GLD1SH_S : sve_mem_32b_gld_vi_32_ptrs<0b0100, "ld1sh", uimm5s2, AArch64ld1s_gather_imm, nxv4i16>;
+ defm GLDFF1SH_S : sve_mem_32b_gld_vi_32_ptrs<0b0101, "ldff1sh", uimm5s2, AArch64ldff1s_gather_imm, nxv4i16>;
+ defm GLD1H_S : sve_mem_32b_gld_vi_32_ptrs<0b0110, "ld1h", uimm5s2, AArch64ld1_gather_imm, nxv4i16>;
+ defm GLDFF1H_S : sve_mem_32b_gld_vi_32_ptrs<0b0111, "ldff1h", uimm5s2, AArch64ldff1_gather_imm, nxv4i16>;
+ defm GLD1W : sve_mem_32b_gld_vi_32_ptrs<0b1010, "ld1w", uimm5s4, AArch64ld1_gather_imm, nxv4i32>;
+ defm GLDFF1W : sve_mem_32b_gld_vi_32_ptrs<0b1011, "ldff1w", uimm5s4, AArch64ldff1_gather_imm, nxv4i32>;
// Gathers using 64-bit pointers with scaled offset, e.g.
// ld1h z0.d, p0/z, [z0.d, #16]
- defm GLD1SB_D : sve_mem_64b_gld_vi_64_ptrs<0b0000, "ld1sb", imm0_31, AArch64ld1s_gather_imm, nxv2i8>;
- defm GLDFF1SB_D : sve_mem_64b_gld_vi_64_ptrs<0b0001, "ldff1sb", imm0_31, null_frag, nxv2i8>;
- defm GLD1B_D : sve_mem_64b_gld_vi_64_ptrs<0b0010, "ld1b", imm0_31, AArch64ld1_gather_imm, nxv2i8>;
- defm GLDFF1B_D : sve_mem_64b_gld_vi_64_ptrs<0b0011, "ldff1b", imm0_31, null_frag, nxv2i8>;
- defm GLD1SH_D : sve_mem_64b_gld_vi_64_ptrs<0b0100, "ld1sh", uimm5s2, AArch64ld1s_gather_imm, nxv2i16>;
- defm GLDFF1SH_D : sve_mem_64b_gld_vi_64_ptrs<0b0101, "ldff1sh", uimm5s2, null_frag, nxv2i16>;
- defm GLD1H_D : sve_mem_64b_gld_vi_64_ptrs<0b0110, "ld1h", uimm5s2, AArch64ld1_gather_imm, nxv2i16>;
- defm GLDFF1H_D : sve_mem_64b_gld_vi_64_ptrs<0b0111, "ldff1h", uimm5s2, null_frag, nxv2i16>;
- defm GLD1SW_D : sve_mem_64b_gld_vi_64_ptrs<0b1000, "ld1sw", uimm5s4, AArch64ld1s_gather_imm, nxv2i32>;
- defm GLDFF1SW_D : sve_mem_64b_gld_vi_64_ptrs<0b1001, "ldff1sw", uimm5s4, null_frag, nxv2i32>;
- defm GLD1W_D : sve_mem_64b_gld_vi_64_ptrs<0b1010, "ld1w", uimm5s4, AArch64ld1_gather_imm, nxv2i32>;
- defm GLDFF1W_D : sve_mem_64b_gld_vi_64_ptrs<0b1011, "ldff1w", uimm5s4, null_frag, nxv2i32>;
- defm GLD1D : sve_mem_64b_gld_vi_64_ptrs<0b1110, "ld1d", uimm5s8, AArch64ld1_gather_imm, nxv2i64>;
- defm GLDFF1D : sve_mem_64b_gld_vi_64_ptrs<0b1111, "ldff1d", uimm5s8, null_frag, nxv2i64>;
+ defm GLD1SB_D : sve_mem_64b_gld_vi_64_ptrs<0b0000, "ld1sb", imm0_31, AArch64ld1s_gather_imm, nxv2i8>;
+ defm GLDFF1SB_D : sve_mem_64b_gld_vi_64_ptrs<0b0001, "ldff1sb", imm0_31, AArch64ldff1s_gather_imm, nxv2i8>;
+ defm GLD1B_D : sve_mem_64b_gld_vi_64_ptrs<0b0010, "ld1b", imm0_31, AArch64ld1_gather_imm, nxv2i8>;
+ defm GLDFF1B_D : sve_mem_64b_gld_vi_64_ptrs<0b0011, "ldff1b", imm0_31, AArch64ldff1_gather_imm, nxv2i8>;
+ defm GLD1SH_D : sve_mem_64b_gld_vi_64_ptrs<0b0100, "ld1sh", uimm5s2, AArch64ld1s_gather_imm, nxv2i16>;
+ defm GLDFF1SH_D : sve_mem_64b_gld_vi_64_ptrs<0b0101, "ldff1sh", uimm5s2, AArch64ldff1s_gather_imm, nxv2i16>;
+ defm GLD1H_D : sve_mem_64b_gld_vi_64_ptrs<0b0110, "ld1h", uimm5s2, AArch64ld1_gather_imm, nxv2i16>;
+ defm GLDFF1H_D : sve_mem_64b_gld_vi_64_ptrs<0b0111, "ldff1h", uimm5s2, AArch64ldff1_gather_imm, nxv2i16>;
+ defm GLD1SW_D : sve_mem_64b_gld_vi_64_ptrs<0b1000, "ld1sw", uimm5s4, AArch64ld1s_gather_imm, nxv2i32>;
+ defm GLDFF1SW_D : sve_mem_64b_gld_vi_64_ptrs<0b1001, "ldff1sw", uimm5s4, AArch64ldff1s_gather_imm, nxv2i32>;
+ defm GLD1W_D : sve_mem_64b_gld_vi_64_ptrs<0b1010, "ld1w", uimm5s4, AArch64ld1_gather_imm, nxv2i32>;
+ defm GLDFF1W_D : sve_mem_64b_gld_vi_64_ptrs<0b1011, "ldff1w", uimm5s4, AArch64ldff1_gather_imm, nxv2i32>;
+ defm GLD1D : sve_mem_64b_gld_vi_64_ptrs<0b1110, "ld1d", uimm5s8, AArch64ld1_gather_imm, nxv2i64>;
+ defm GLDFF1D : sve_mem_64b_gld_vi_64_ptrs<0b1111, "ldff1d", uimm5s8, AArch64ldff1_gather_imm, nxv2i64>;
// Gathers using unscaled 64-bit offsets, e.g.
// ld1h z0.d, p0/z, [x0, z0.d]
- defm GLD1SB_D : sve_mem_64b_gld_vs2_64_unscaled<0b0000, "ld1sb", AArch64ld1s_gather, nxv2i8>;
- defm GLDFF1SB_D : sve_mem_64b_gld_vs2_64_unscaled<0b0001, "ldff1sb", null_frag, nxv2i8>;
- defm GLD1B_D : sve_mem_64b_gld_vs2_64_unscaled<0b0010, "ld1b", AArch64ld1_gather, nxv2i8>;
- defm GLDFF1B_D : sve_mem_64b_gld_vs2_64_unscaled<0b0011, "ldff1b", null_frag, nxv2i8>;
- defm GLD1SH_D : sve_mem_64b_gld_vs2_64_unscaled<0b0100, "ld1sh", AArch64ld1s_gather, nxv2i16>;
- defm GLDFF1SH_D : sve_mem_64b_gld_vs2_64_unscaled<0b0101, "ldff1sh", null_frag, nxv2i16>;
- defm GLD1H_D : sve_mem_64b_gld_vs2_64_unscaled<0b0110, "ld1h", AArch64ld1_gather, nxv2i16>;
- defm GLDFF1H_D : sve_mem_64b_gld_vs2_64_unscaled<0b0111, "ldff1h", null_frag, nxv2i16>;
- defm GLD1SW_D : sve_mem_64b_gld_vs2_64_unscaled<0b1000, "ld1sw", AArch64ld1s_gather, nxv2i32>;
- defm GLDFF1SW_D : sve_mem_64b_gld_vs2_64_unscaled<0b1001, "ldff1sw", null_frag, nxv2i32>;
- defm GLD1W_D : sve_mem_64b_gld_vs2_64_unscaled<0b1010, "ld1w", AArch64ld1_gather, nxv2i32>;
- defm GLDFF1W_D : sve_mem_64b_gld_vs2_64_unscaled<0b1011, "ldff1w", null_frag, nxv2i32>;
- defm GLD1D : sve_mem_64b_gld_vs2_64_unscaled<0b1110, "ld1d", AArch64ld1_gather, nxv2i64>;
- defm GLDFF1D : sve_mem_64b_gld_vs2_64_unscaled<0b1111, "ldff1d", null_frag, nxv2i64>;
+ defm GLD1SB_D : sve_mem_64b_gld_vs2_64_unscaled<0b0000, "ld1sb", AArch64ld1s_gather, nxv2i8>;
+ defm GLDFF1SB_D : sve_mem_64b_gld_vs2_64_unscaled<0b0001, "ldff1sb", AArch64ldff1s_gather, nxv2i8>;
+ defm GLD1B_D : sve_mem_64b_gld_vs2_64_unscaled<0b0010, "ld1b", AArch64ld1_gather, nxv2i8>;
+ defm GLDFF1B_D : sve_mem_64b_gld_vs2_64_unscaled<0b0011, "ldff1b", AArch64ldff1_gather, nxv2i8>;
+ defm GLD1SH_D : sve_mem_64b_gld_vs2_64_unscaled<0b0100, "ld1sh", AArch64ld1s_gather, nxv2i16>;
+ defm GLDFF1SH_D : sve_mem_64b_gld_vs2_64_unscaled<0b0101, "ldff1sh", AArch64ldff1s_gather, nxv2i16>;
+ defm GLD1H_D : sve_mem_64b_gld_vs2_64_unscaled<0b0110, "ld1h", AArch64ld1_gather, nxv2i16>;
+ defm GLDFF1H_D : sve_mem_64b_gld_vs2_64_unscaled<0b0111, "ldff1h", AArch64ldff1_gather, nxv2i16>;
+ defm GLD1SW_D : sve_mem_64b_gld_vs2_64_unscaled<0b1000, "ld1sw", AArch64ld1s_gather, nxv2i32>;
+ defm GLDFF1SW_D : sve_mem_64b_gld_vs2_64_unscaled<0b1001, "ldff1sw", AArch64ldff1s_gather, nxv2i32>;
+ defm GLD1W_D : sve_mem_64b_gld_vs2_64_unscaled<0b1010, "ld1w", AArch64ld1_gather, nxv2i32>;
+ defm GLDFF1W_D : sve_mem_64b_gld_vs2_64_unscaled<0b1011, "ldff1w", AArch64ldff1_gather, nxv2i32>;
+ defm GLD1D : sve_mem_64b_gld_vs2_64_unscaled<0b1110, "ld1d", AArch64ld1_gather, nxv2i64>;
+ defm GLDFF1D : sve_mem_64b_gld_vs2_64_unscaled<0b1111, "ldff1d", AArch64ldff1_gather, nxv2i64>;
// Gathers using scaled 64-bit offsets, e.g.
// ld1h z0.d, p0/z, [x0, z0.d, lsl #1]
- defm GLD1SH_D : sve_mem_64b_gld_sv2_64_scaled<0b0100, "ld1sh", AArch64ld1s_gather_scaled, ZPR64ExtLSL16, nxv2i16>;
- defm GLDFF1SH_D : sve_mem_64b_gld_sv2_64_scaled<0b0101, "ldff1sh", null_frag, ZPR64ExtLSL16, nxv2i16>;
- defm GLD1H_D : sve_mem_64b_gld_sv2_64_scaled<0b0110, "ld1h", AArch64ld1_gather_scaled, ZPR64ExtLSL16, nxv2i16>;
- defm GLDFF1H_D : sve_mem_64b_gld_sv2_64_scaled<0b0111, "ldff1h", null_frag, ZPR64ExtLSL16, nxv2i16>;
- defm GLD1SW_D : sve_mem_64b_gld_sv2_64_scaled<0b1000, "ld1sw", AArch64ld1s_gather_scaled, ZPR64ExtLSL32, nxv2i32>;
- defm GLDFF1SW_D : sve_mem_64b_gld_sv2_64_scaled<0b1001, "ldff1sw", null_frag, ZPR64ExtLSL32, nxv2i32>;
- defm GLD1W_D : sve_mem_64b_gld_sv2_64_scaled<0b1010, "ld1w", AArch64ld1_gather_scaled, ZPR64ExtLSL32, nxv2i32>;
- defm GLDFF1W_D : sve_mem_64b_gld_sv2_64_scaled<0b1011, "ldff1w", null_frag, ZPR64ExtLSL32, nxv2i32>;
- defm GLD1D : sve_mem_64b_gld_sv2_64_scaled<0b1110, "ld1d", AArch64ld1_gather_scaled, ZPR64ExtLSL64, nxv2i64>;
- defm GLDFF1D : sve_mem_64b_gld_sv2_64_scaled<0b1111, "ldff1d", null_frag, ZPR64ExtLSL64, nxv2i64>;
+ defm GLD1SH_D : sve_mem_64b_gld_sv2_64_scaled<0b0100, "ld1sh", AArch64ld1s_gather_scaled, ZPR64ExtLSL16, nxv2i16>;
+ defm GLDFF1SH_D : sve_mem_64b_gld_sv2_64_scaled<0b0101, "ldff1sh", AArch64ldff1s_gather_scaled, ZPR64ExtLSL16, nxv2i16>;
+ defm GLD1H_D : sve_mem_64b_gld_sv2_64_scaled<0b0110, "ld1h", AArch64ld1_gather_scaled, ZPR64ExtLSL16, nxv2i16>;
+ defm GLDFF1H_D : sve_mem_64b_gld_sv2_64_scaled<0b0111, "ldff1h", AArch64ldff1_gather_scaled, ZPR64ExtLSL16, nxv2i16>;
+ defm GLD1SW_D : sve_mem_64b_gld_sv2_64_scaled<0b1000, "ld1sw", AArch64ld1s_gather_scaled, ZPR64ExtLSL32, nxv2i32>;
+ defm GLDFF1SW_D : sve_mem_64b_gld_sv2_64_scaled<0b1001, "ldff1sw", AArch64ldff1s_gather_scaled, ZPR64ExtLSL32, nxv2i32>;
+ defm GLD1W_D : sve_mem_64b_gld_sv2_64_scaled<0b1010, "ld1w", AArch64ld1_gather_scaled, ZPR64ExtLSL32, nxv2i32>;
+ defm GLDFF1W_D : sve_mem_64b_gld_sv2_64_scaled<0b1011, "ldff1w", AArch64ldff1_gather_scaled, ZPR64ExtLSL32, nxv2i32>;
+ defm GLD1D : sve_mem_64b_gld_sv2_64_scaled<0b1110, "ld1d", AArch64ld1_gather_scaled, ZPR64ExtLSL64, nxv2i64>;
+ defm GLDFF1D : sve_mem_64b_gld_sv2_64_scaled<0b1111, "ldff1d", AArch64ldff1_gather_scaled, ZPR64ExtLSL64, nxv2i64>;
// Gathers using unscaled 32-bit offsets unpacked in 64-bits elements, e.g.
// ld1h z0.d, p0/z, [x0, z0.d, uxtw]
defm GLD1SB_D : sve_mem_64b_gld_vs_32_unscaled<0b0000, "ld1sb", AArch64ld1s_gather_sxtw, AArch64ld1s_gather_uxtw, ZPR64ExtSXTW8Only, ZPR64ExtUXTW8Only, nxv2i8>;
- defm GLDFF1SB_D : sve_mem_64b_gld_vs_32_unscaled<0b0001, "ldff1sb", null_frag, null_frag, ZPR64ExtSXTW8Only, ZPR64ExtUXTW8Only, nxv2i8>;
+ defm GLDFF1SB_D : sve_mem_64b_gld_vs_32_unscaled<0b0001, "ldff1sb", AArch64ldff1s_gather_sxtw, AArch64ldff1s_gather_uxtw, ZPR64ExtSXTW8Only, ZPR64ExtUXTW8Only, nxv2i8>;
defm GLD1B_D : sve_mem_64b_gld_vs_32_unscaled<0b0010, "ld1b", AArch64ld1_gather_sxtw, AArch64ld1_gather_uxtw, ZPR64ExtSXTW8Only, ZPR64ExtUXTW8Only, nxv2i8>;
- defm GLDFF1B_D : sve_mem_64b_gld_vs_32_unscaled<0b0011, "ldff1b", null_frag, null_frag, ZPR64ExtSXTW8Only, ZPR64ExtUXTW8Only, nxv2i8>;
+ defm GLDFF1B_D : sve_mem_64b_gld_vs_32_unscaled<0b0011, "ldff1b", AArch64ldff1_gather_sxtw, AArch64ldff1_gather_uxtw, ZPR64ExtSXTW8Only, ZPR64ExtUXTW8Only, nxv2i8>;
defm GLD1SH_D : sve_mem_64b_gld_vs_32_unscaled<0b0100, "ld1sh", AArch64ld1s_gather_sxtw, AArch64ld1s_gather_uxtw, ZPR64ExtSXTW8, ZPR64ExtUXTW8, nxv2i16>;
- defm GLDFF1SH_D : sve_mem_64b_gld_vs_32_unscaled<0b0101, "ldff1sh", null_frag, null_frag, ZPR64ExtSXTW8, ZPR64ExtUXTW8, nxv2i16>;
+ defm GLDFF1SH_D : sve_mem_64b_gld_vs_32_unscaled<0b0101, "ldff1sh", AArch64ldff1s_gather_sxtw, AArch64ldff1s_gather_uxtw, ZPR64ExtSXTW8, ZPR64ExtUXTW8, nxv2i16>;
defm GLD1H_D : sve_mem_64b_gld_vs_32_unscaled<0b0110, "ld1h", AArch64ld1_gather_sxtw, AArch64ld1_gather_uxtw, ZPR64ExtSXTW8, ZPR64ExtUXTW8, nxv2i16>;
- defm GLDFF1H_D : sve_mem_64b_gld_vs_32_unscaled<0b0111, "ldff1h", null_frag, null_frag, ZPR64ExtSXTW8, ZPR64ExtUXTW8, nxv2i16>;
+ defm GLDFF1H_D : sve_mem_64b_gld_vs_32_unscaled<0b0111, "ldff1h", AArch64ldff1_gather_sxtw, AArch64ldff1_gather_uxtw, ZPR64ExtSXTW8, ZPR64ExtUXTW8, nxv2i16>;
defm GLD1SW_D : sve_mem_64b_gld_vs_32_unscaled<0b1000, "ld1sw", AArch64ld1s_gather_sxtw, AArch64ld1s_gather_uxtw, ZPR64ExtSXTW8, ZPR64ExtUXTW8, nxv2i32>;
- defm GLDFF1SW_D : sve_mem_64b_gld_vs_32_unscaled<0b1001, "ldff1sw", null_frag, null_frag, ZPR64ExtSXTW8, ZPR64ExtUXTW8, nxv2i32>;
+ defm GLDFF1SW_D : sve_mem_64b_gld_vs_32_unscaled<0b1001, "ldff1sw", AArch64ldff1s_gather_sxtw, AArch64ldff1s_gather_uxtw, ZPR64ExtSXTW8, ZPR64ExtUXTW8, nxv2i32>;
defm GLD1W_D : sve_mem_64b_gld_vs_32_unscaled<0b1010, "ld1w", AArch64ld1_gather_sxtw, AArch64ld1_gather_uxtw, ZPR64ExtSXTW8, ZPR64ExtUXTW8, nxv2i32>;
- defm GLDFF1W_D : sve_mem_64b_gld_vs_32_unscaled<0b1011, "ldff1w", null_frag, null_frag, ZPR64ExtSXTW8, ZPR64ExtUXTW8, nxv2i32>;
+ defm GLDFF1W_D : sve_mem_64b_gld_vs_32_unscaled<0b1011, "ldff1w", AArch64ldff1_gather_sxtw, AArch64ldff1_gather_uxtw, ZPR64ExtSXTW8, ZPR64ExtUXTW8, nxv2i32>;
defm GLD1D : sve_mem_64b_gld_vs_32_unscaled<0b1110, "ld1d", AArch64ld1_gather_sxtw, AArch64ld1_gather_uxtw, ZPR64ExtSXTW8, ZPR64ExtUXTW8, nxv2i64>;
- defm GLDFF1D : sve_mem_64b_gld_vs_32_unscaled<0b1111, "ldff1d", null_frag, null_frag, ZPR64ExtSXTW8, ZPR64ExtUXTW8, nxv2i64>;
+ defm GLDFF1D : sve_mem_64b_gld_vs_32_unscaled<0b1111, "ldff1d", AArch64ldff1_gather_sxtw, AArch64ldff1_gather_uxtw, ZPR64ExtSXTW8, ZPR64ExtUXTW8, nxv2i64>;
// Gathers using scaled 32-bit offsets unpacked in 64-bits elements, e.g.
// ld1h z0.d, p0/z, [x0, z0.d, uxtw #1]
- defm GLD1SH_D : sve_mem_64b_gld_sv_32_scaled<0b0100, "ld1sh", AArch64ld1s_gather_sxtw_scaled, AArch64ld1s_gather_uxtw_scaled, ZPR64ExtSXTW16, ZPR64ExtUXTW16, nxv2i16>;
- defm GLDFF1SH_D : sve_mem_64b_gld_sv_32_scaled<0b0101, "ldff1sh", null_frag, null_frag, ZPR64ExtSXTW16, ZPR64ExtUXTW16, nxv2i16>;
- defm GLD1H_D : sve_mem_64b_gld_sv_32_scaled<0b0110, "ld1h", AArch64ld1_gather_sxtw_scaled, AArch64ld1_gather_uxtw_scaled, ZPR64ExtSXTW16, ZPR64ExtUXTW16, nxv2i16>;
- defm GLDFF1H_D : sve_mem_64b_gld_sv_32_scaled<0b0111, "ldff1h", null_frag, null_frag, ZPR64ExtSXTW16, ZPR64ExtUXTW16, nxv2i16>;
- defm GLD1SW_D : sve_mem_64b_gld_sv_32_scaled<0b1000, "ld1sw", AArch64ld1s_gather_sxtw_scaled, AArch64ld1s_gather_uxtw_scaled, ZPR64ExtSXTW32, ZPR64ExtUXTW32, nxv2i32>;
- defm GLDFF1SW_D : sve_mem_64b_gld_sv_32_scaled<0b1001, "ldff1sw", null_frag, null_frag, ZPR64ExtSXTW32, ZPR64ExtUXTW32, nxv2i32>;
- defm GLD1W_D : sve_mem_64b_gld_sv_32_scaled<0b1010, "ld1w", AArch64ld1_gather_sxtw_scaled, AArch64ld1_gather_uxtw_scaled, ZPR64ExtSXTW32, ZPR64ExtUXTW32, nxv2i32>;
- defm GLDFF1W_D : sve_mem_64b_gld_sv_32_scaled<0b1011, "ldff1w", null_frag, null_frag, ZPR64ExtSXTW32, ZPR64ExtUXTW32, nxv2i32>;
- defm GLD1D : sve_mem_64b_gld_sv_32_scaled<0b1110, "ld1d", AArch64ld1_gather_sxtw_scaled, AArch64ld1_gather_uxtw_scaled, ZPR64ExtSXTW64, ZPR64ExtUXTW64, nxv2i64>;
- defm GLDFF1D : sve_mem_64b_gld_sv_32_scaled<0b1111, "ldff1d", null_frag, null_frag, ZPR64ExtSXTW64, ZPR64ExtUXTW64, nxv2i64>;
+ defm GLD1SH_D : sve_mem_64b_gld_sv_32_scaled<0b0100, "ld1sh", AArch64ld1s_gather_sxtw_scaled, AArch64ld1s_gather_uxtw_scaled, ZPR64ExtSXTW16, ZPR64ExtUXTW16, nxv2i16>;
+ defm GLDFF1SH_D : sve_mem_64b_gld_sv_32_scaled<0b0101, "ldff1sh", AArch64ldff1s_gather_sxtw_scaled, AArch64ldff1s_gather_uxtw_scaled, ZPR64ExtSXTW16, ZPR64ExtUXTW16, nxv2i16>;
+ defm GLD1H_D : sve_mem_64b_gld_sv_32_scaled<0b0110, "ld1h", AArch64ld1_gather_sxtw_scaled, AArch64ld1_gather_uxtw_scaled, ZPR64ExtSXTW16, ZPR64ExtUXTW16, nxv2i16>;
+ defm GLDFF1H_D : sve_mem_64b_gld_sv_32_scaled<0b0111, "ldff1h", AArch64ldff1_gather_sxtw_scaled, AArch64ldff1_gather_uxtw_scaled, ZPR64ExtSXTW16, ZPR64ExtUXTW16, nxv2i16>;
+ defm GLD1SW_D : sve_mem_64b_gld_sv_32_scaled<0b1000, "ld1sw", AArch64ld1s_gather_sxtw_scaled, AArch64ld1s_gather_uxtw_scaled, ZPR64ExtSXTW32, ZPR64ExtUXTW32, nxv2i32>;
+ defm GLDFF1SW_D : sve_mem_64b_gld_sv_32_scaled<0b1001, "ldff1sw", AArch64ldff1s_gather_sxtw_scaled, AArch64ldff1s_gather_uxtw_scaled, ZPR64ExtSXTW32, ZPR64ExtUXTW32, nxv2i32>;
+ defm GLD1W_D : sve_mem_64b_gld_sv_32_scaled<0b1010, "ld1w", AArch64ld1_gather_sxtw_scaled, AArch64ld1_gather_uxtw_scaled, ZPR64ExtSXTW32, ZPR64ExtUXTW32, nxv2i32>;
+ defm GLDFF1W_D : sve_mem_64b_gld_sv_32_scaled<0b1011, "ldff1w", AArch64ldff1_gather_sxtw_scaled, AArch64ldff1_gather_uxtw_scaled, ZPR64ExtSXTW32, ZPR64ExtUXTW32, nxv2i32>;
+ defm GLD1D : sve_mem_64b_gld_sv_32_scaled<0b1110, "ld1d", AArch64ld1_gather_sxtw_scaled, AArch64ld1_gather_uxtw_scaled, ZPR64ExtSXTW64, ZPR64ExtUXTW64, nxv2i64>;
+ defm GLDFF1D : sve_mem_64b_gld_sv_32_scaled<0b1111, "ldff1d", AArch64ldff1_gather_sxtw_scaled, AArch64ldff1_gather_uxtw_scaled, ZPR64ExtSXTW64, ZPR64ExtUXTW64, nxv2i64>;
// Non-temporal contiguous loads (register + immediate)
defm LDNT1B_ZRI : sve_mem_cldnt_si<0b00, "ldnt1b", Z_b, ZPR8>;
diff --git a/llvm/lib/Target/AArch64/SVEInstrFormats.td b/llvm/lib/Target/AArch64/SVEInstrFormats.td
index ff0cbdc4a8b9..7cd35d818dac 100644
--- a/llvm/lib/Target/AArch64/SVEInstrFormats.td
+++ b/llvm/lib/Target/AArch64/SVEInstrFormats.td
@@ -6252,10 +6252,19 @@ multiclass sve_mem_32b_gld_sv_32_scaled<bits<4> opc, string asm,
def : InstAlias<asm # "\t$Zt, $Pg/z, [$Rn, $Zm]",
(!cast<Instruction>(NAME # _SXTW_SCALED_REAL) ZPR32:$Zt, PPR3bAny:$Pg, GPR64sp:$Rn, sxtw_opnd:$Zm), 0>;
+ // We need a layer of indirection because early machine code passes balk at
+ // physical register (i.e. FFR) uses that have no previous definition.
+ let hasSideEffects = 1, hasNoSchedulingInfo = 1 in {
+ def _UXTW_SCALED : Pseudo<(outs Z_s:$Zt), (ins PPR3bAny:$Pg, GPR64sp:$Rn, uxtw_opnd:$Zm), []>,
+ PseudoInstExpansion<(!cast<Instruction>(NAME # _UXTW_SCALED_REAL) Z_s:$Zt, PPR3bAny:$Pg, GPR64sp:$Rn, uxtw_opnd:$Zm)>;
+ def _SXTW_SCALED : Pseudo<(outs Z_s:$Zt), (ins PPR3bAny:$Pg, GPR64sp:$Rn, sxtw_opnd:$Zm), []>,
+ PseudoInstExpansion<(!cast<Instruction>(NAME # _SXTW_SCALED_REAL) Z_s:$Zt, PPR3bAny:$Pg, GPR64sp:$Rn, sxtw_opnd:$Zm)>;
+ }
+
def : Pat<(nxv4i32 (uxtw_op (nxv4i1 PPR:$gp), GPR64sp:$base, (nxv4i32 ZPR:$indices), vt)),
- (!cast<Instruction>(NAME # _UXTW_SCALED_REAL) PPR:$gp, GPR64sp:$base, ZPR:$indices)>;
+ (!cast<Instruction>(NAME # _UXTW_SCALED) PPR:$gp, GPR64sp:$base, ZPR:$indices)>;
def : Pat<(nxv4i32 (sxtw_op (nxv4i1 PPR:$gp), GPR64sp:$base, (nxv4i32 ZPR:$indices), vt)),
- (!cast<Instruction>(NAME # _SXTW_SCALED_REAL) PPR:$gp, GPR64sp:$base, ZPR:$indices)>;
+ (!cast<Instruction>(NAME # _SXTW_SCALED) PPR:$gp, GPR64sp:$base, ZPR:$indices)>;
}
multiclass sve_mem_32b_gld_vs_32_unscaled<bits<4> opc, string asm,
@@ -6272,10 +6281,19 @@ multiclass sve_mem_32b_gld_vs_32_unscaled<bits<4> opc, string asm,
def : InstAlias<asm # "\t$Zt, $Pg/z, [$Rn, $Zm]",
(!cast<Instruction>(NAME # _SXTW_REAL) ZPR32:$Zt, PPR3bAny:$Pg, GPR64sp:$Rn, sxtw_opnd:$Zm), 0>;
+ // We need a layer of indirection because early machine code passes balk at
+ // physical register (i.e. FFR) uses that have no previous definition.
+ let hasSideEffects = 1, hasNoSchedulingInfo = 1 in {
+ def _UXTW : Pseudo<(outs Z_s:$Zt), (ins PPR3bAny:$Pg, GPR64sp:$Rn, uxtw_opnd:$Zm), []>,
+ PseudoInstExpansion<(!cast<Instruction>(NAME # _UXTW_REAL) Z_s:$Zt, PPR3bAny:$Pg, GPR64sp:$Rn, uxtw_opnd:$Zm)>;
+ def _SXTW : Pseudo<(outs Z_s:$Zt), (ins PPR3bAny:$Pg, GPR64sp:$Rn, sxtw_opnd:$Zm), []>,
+ PseudoInstExpansion<(!cast<Instruction>(NAME # _SXTW_REAL) Z_s:$Zt, PPR3bAny:$Pg, GPR64sp:$Rn, sxtw_opnd:$Zm)>;
+ }
+
def : Pat<(nxv4i32 (uxtw_op (nxv4i1 PPR:$gp), GPR64sp:$base, (nxv4i32 ZPR:$offsets), vt)),
- (!cast<Instruction>(NAME # _UXTW_REAL) PPR:$gp, GPR64sp:$base, ZPR:$offsets)>;
+ (!cast<Instruction>(NAME # _UXTW) PPR:$gp, GPR64sp:$base, ZPR:$offsets)>;
def : Pat<(nxv4i32 (sxtw_op (nxv4i1 PPR:$gp), GPR64sp:$base, (nxv4i32 ZPR:$offsets), vt)),
- (!cast<Instruction>(NAME # _SXTW_REAL) PPR:$gp, GPR64sp:$base, ZPR:$offsets)>;
+ (!cast<Instruction>(NAME # _SXTW) PPR:$gp, GPR64sp:$base, ZPR:$offsets)>;
}
@@ -6314,8 +6332,15 @@ multiclass sve_mem_32b_gld_vi_32_ptrs<bits<4> opc, string asm, Operand imm_ty,
def : InstAlias<asm # "\t$Zt, $Pg/z, [$Zn]",
(!cast<Instruction>(NAME # _IMM_REAL) Z_s:$Zt, PPR3bAny:$Pg, ZPR32:$Zn, 0), 1>;
+ // We need a layer of indirection because early machine code passes balk at
+ // physical register (i.e. FFR) uses that have no previous definition.
+ let hasSideEffects = 1, hasNoSchedulingInfo = 1 in {
+ def _IMM : Pseudo<(outs Z_s:$Zt), (ins PPR3bAny:$Pg, ZPR32:$Zn, imm_ty:$imm5), []>,
+ PseudoInstExpansion<(!cast<Instruction>(NAME # _IMM_REAL) Z_s:$Zt, PPR3bAny:$Pg, ZPR32:$Zn, imm_ty:$imm5)>;
+ }
+
def : Pat<(nxv4i32 (op (nxv4i1 PPR:$gp), (nxv4i32 ZPR:$ptrs), imm_ty:$index, vt)),
- (!cast<Instruction>(NAME # _IMM_REAL) PPR:$gp, ZPR:$ptrs, imm_ty:$index)>;
+ (!cast<Instruction>(NAME # _IMM) PPR:$gp, ZPR:$ptrs, imm_ty:$index)>;
}
class sve_mem_prfm_si<bits<2> msz, string asm>
@@ -6564,10 +6589,19 @@ multiclass sve_mem_64b_gld_sv_32_scaled<bits<4> opc, string asm,
def : InstAlias<asm # "\t$Zt, $Pg/z, [$Rn, $Zm]",
(!cast<Instruction>(NAME # _SXTW_SCALED_REAL) ZPR64:$Zt, PPR3bAny:$Pg, GPR64sp:$Rn, sxtw_opnd:$Zm), 0>;
+ // We need a layer of indirection because early machine code passes balk at
+ // physical register (i.e. FFR) uses that have no previous definition.
+ let hasSideEffects = 1, hasNoSchedulingInfo = 1 in {
+ def _UXTW_SCALED : Pseudo<(outs Z_d:$Zt), (ins PPR3bAny:$Pg, GPR64sp:$Rn, uxtw_opnd:$Zm), []>,
+ PseudoInstExpansion<(!cast<Instruction>(NAME # _UXTW_SCALED_REAL) Z_d:$Zt, PPR3bAny:$Pg, GPR64sp:$Rn, uxtw_opnd:$Zm)>;
+ def _SXTW_SCALED : Pseudo<(outs Z_d:$Zt), (ins PPR3bAny:$Pg, GPR64sp:$Rn, sxtw_opnd:$Zm), []>,
+ PseudoInstExpansion<(!cast<Instruction>(NAME # _SXTW_SCALED_REAL) Z_d:$Zt, PPR3bAny:$Pg, GPR64sp:$Rn, sxtw_opnd:$Zm)>;
+ }
+
def : Pat<(nxv2i64 (uxtw_op (nxv2i1 PPR:$gp), GPR64sp:$base, (nxv2i64 ZPR:$indices), vt)),
- (!cast<Instruction>(NAME # _UXTW_SCALED_REAL) PPR:$gp, GPR64sp:$base, ZPR:$indices)>;
+ (!cast<Instruction>(NAME # _UXTW_SCALED) PPR:$gp, GPR64sp:$base, ZPR:$indices)>;
def : Pat<(nxv2i64 (sxtw_op (nxv2i1 PPR:$gp), GPR64sp:$base, (nxv2i64 ZPR:$indices), vt)),
- (!cast<Instruction>(NAME # _SXTW_SCALED_REAL) PPR:$gp, GPR64sp:$base, ZPR:$indices)>;
+ (!cast<Instruction>(NAME # _SXTW_SCALED) PPR:$gp, GPR64sp:$base, ZPR:$indices)>;
}
multiclass sve_mem_64b_gld_vs_32_unscaled<bits<4> opc, string asm,
@@ -6584,10 +6618,19 @@ multiclass sve_mem_64b_gld_vs_32_unscaled<bits<4> opc, string asm,
def : InstAlias<asm # "\t$Zt, $Pg/z, [$Rn, $Zm]",
(!cast<Instruction>(NAME # _SXTW_REAL) ZPR64:$Zt, PPR3bAny:$Pg, GPR64sp:$Rn, sxtw_opnd:$Zm), 0>;
+ // We need a layer of indirection because early machine code passes balk at
+ // physical register (i.e. FFR) uses that have no previous definition.
+ let hasSideEffects = 1, hasNoSchedulingInfo = 1 in {
+ def _UXTW : Pseudo<(outs Z_d:$Zt), (ins PPR3bAny:$Pg, GPR64sp:$Rn, uxtw_opnd:$Zm), []>,
+ PseudoInstExpansion<(!cast<Instruction>(NAME # _UXTW_REAL) Z_d:$Zt, PPR3bAny:$Pg, GPR64sp:$Rn, uxtw_opnd:$Zm)>;
+ def _SXTW : Pseudo<(outs Z_d:$Zt), (ins PPR3bAny:$Pg, GPR64sp:$Rn, sxtw_opnd:$Zm), []>,
+ PseudoInstExpansion<(!cast<Instruction>(NAME # _SXTW_REAL) Z_d:$Zt, PPR3bAny:$Pg, GPR64sp:$Rn, sxtw_opnd:$Zm)>;
+ }
+
def : Pat<(nxv2i64 (uxtw_op (nxv2i1 PPR:$gp), GPR64sp:$base, (nxv2i64 ZPR:$offsets), vt)),
- (!cast<Instruction>(NAME # _UXTW_REAL) PPR:$gp, GPR64sp:$base, ZPR:$offsets)>;
+ (!cast<Instruction>(NAME # _UXTW) PPR:$gp, GPR64sp:$base, ZPR:$offsets)>;
def : Pat<(nxv2i64 (sxtw_op (nxv2i1 PPR:$gp), GPR64sp:$base, (nxv2i64 ZPR:$offsets), vt)),
- (!cast<Instruction>(NAME # _SXTW_REAL) PPR:$gp, GPR64sp:$base, ZPR:$offsets)>;
+ (!cast<Instruction>(NAME # _SXTW) PPR:$gp, GPR64sp:$base, ZPR:$offsets)>;
}
multiclass sve_mem_64b_gld_sv2_64_scaled<bits<4> opc, string asm,
@@ -6598,8 +6641,15 @@ multiclass sve_mem_64b_gld_sv2_64_scaled<bits<4> opc, string asm,
def : InstAlias<asm # "\t$Zt, $Pg/z, [$Rn, $Zm]",
(!cast<Instruction>(NAME # _SCALED_REAL) ZPR64:$Zt, PPR3bAny:$Pg, GPR64sp:$Rn, zprext:$Zm), 0>;
+ // We need a layer of indirection because early machine code passes balk at
+ // physical register (i.e. FFR) uses that have no previous definition.
+ let hasSideEffects = 1, hasNoSchedulingInfo = 1 in {
+ def _SCALED : Pseudo<(outs Z_d:$Zt), (ins PPR3bAny:$Pg, GPR64sp:$Rn, zprext:$Zm), []>,
+ PseudoInstExpansion<(!cast<Instruction>(NAME # _SCALED_REAL) Z_d:$Zt, PPR3bAny:$Pg, GPR64sp:$Rn, zprext:$Zm)>;
+ }
+
def : Pat<(nxv2i64 (op (nxv2i1 PPR:$gp), GPR64sp:$base, (nxv2i64 ZPR:$indices), vt)),
- (!cast<Instruction>(NAME # _SCALED_REAL) PPR:$gp, GPR64sp:$base, ZPR:$indices)>;
+ (!cast<Instruction>(NAME # _SCALED) PPR:$gp, GPR64sp:$base, ZPR:$indices)>;
}
multiclass sve_mem_64b_gld_vs2_64_unscaled<bits<4> opc, string asm,
@@ -6609,8 +6659,15 @@ multiclass sve_mem_64b_gld_vs2_64_unscaled<bits<4> opc, string asm,
def : InstAlias<asm # "\t$Zt, $Pg/z, [$Rn, $Zm]",
(!cast<Instruction>(NAME # _REAL) ZPR64:$Zt, PPR3bAny:$Pg, GPR64sp:$Rn, ZPR64ExtLSL8:$Zm), 0>;
+ // We need a layer of indirection because early machine code passes balk at
+ // physical register (i.e. FFR) uses that have no previous definition.
+ let hasSideEffects = 1, hasNoSchedulingInfo = 1 in {
+ def "" : Pseudo<(outs Z_d:$Zt), (ins PPR3bAny:$Pg, GPR64sp:$Rn, ZPR64ExtLSL8:$Zm), []>,
+ PseudoInstExpansion<(!cast<Instruction>(NAME # _REAL) Z_d:$Zt, PPR3bAny:$Pg, GPR64sp:$Rn, ZPR64ExtLSL8:$Zm)>;
+ }
+
def : Pat<(nxv2i64 (op (nxv2i1 PPR:$gp), GPR64sp:$base, (nxv2i64 ZPR:$offsets), vt)),
- (!cast<Instruction>(NAME # _REAL) PPR:$gp, GPR64sp:$base, ZPR:$offsets)>;
+ (!cast<Instruction>(NAME) PPR:$gp, GPR64sp:$base, ZPR:$offsets)>;
}
class sve_mem_64b_gld_vi<bits<4> opc, string asm, Operand imm_ty>
@@ -6648,8 +6705,15 @@ multiclass sve_mem_64b_gld_vi_64_ptrs<bits<4> opc, string asm, Operand imm_ty,
def : InstAlias<asm # "\t$Zt, $Pg/z, [$Zn]",
(!cast<Instruction>(NAME # _IMM_REAL) Z_d:$Zt, PPR3bAny:$Pg, ZPR64:$Zn, 0), 1>;
+ // We need a layer of indirection because early machine code passes balk at
+ // physical register (i.e. FFR) uses that have no previous definition.
+ let hasSideEffects = 1, hasNoSchedulingInfo = 1 in {
+ def _IMM : Pseudo<(outs Z_d:$Zt), (ins PPR3bAny:$Pg, ZPR64:$Zn, imm_ty:$imm5), []>,
+ PseudoInstExpansion<(!cast<Instruction>(NAME # _IMM_REAL) Z_d:$Zt, PPR3bAny:$Pg, ZPR64:$Zn, imm_ty:$imm5)>;
+ }
+
def : Pat<(nxv2i64 (op (nxv2i1 PPR:$gp), (nxv2i64 ZPR:$ptrs), imm_ty:$index, vt)),
- (!cast<Instruction>(NAME # _IMM_REAL) PPR:$gp, ZPR:$ptrs, imm_ty:$index)>;
+ (!cast<Instruction>(NAME # _IMM) PPR:$gp, ZPR:$ptrs, imm_ty:$index)>;
}
// bit lsl is '0' if the offsets are extended (uxtw/sxtw), '1' if shifted (lsl)
diff --git a/llvm/test/CodeGen/AArch64/sve-intrinsics-ff-gather-loads-32bit-scaled-offsets.ll b/llvm/test/CodeGen/AArch64/sve-intrinsics-ff-gather-loads-32bit-scaled-offsets.ll
new file mode 100644
index 000000000000..a86da9594a21
--- /dev/null
+++ b/llvm/test/CodeGen/AArch64/sve-intrinsics-ff-gather-loads-32bit-scaled-offsets.ll
@@ -0,0 +1,255 @@
+; RUN: llc -mtriple=aarch64-linux-gnu -mattr=+sve < %s | FileCheck %s
+
+;
+; LDFF1H, LDFF1W, LDFF1D: base + 32-bit scaled offset, sign (sxtw) or zero (uxtw)
+; extended to 64 bits
+; e.g. ldff1h z0.d, p0/z, [x0, z0.d, uxtw #1]
+;
+
+; LDFF1H
+define <vscale x 4 x i32> @gldff1h_s_uxtw_index(<vscale x 4 x i1> %pg, i16* %base, <vscale x 4 x i32> %b) {
+; CHECK-LABEL: gldff1h_s_uxtw_index:
+; CHECK: ldff1h { z0.s }, p0/z, [x0, z0.s, uxtw #1]
+; CHECK-NEXT: ret
+ %load = call <vscale x 4 x i16> @llvm.aarch64.sve.ldff1.gather.uxtw.index.nxv4i16(<vscale x 4 x i1> %pg,
+ i16* %base,
+ <vscale x 4 x i32> %b)
+ %res = zext <vscale x 4 x i16> %load to <vscale x 4 x i32>
+ ret <vscale x 4 x i32> %res
+}
+
+define <vscale x 4 x i32> @gldff1h_s_sxtw_index(<vscale x 4 x i1> %pg, i16* %base, <vscale x 4 x i32> %b) {
+; CHECK-LABEL: gldff1h_s_sxtw_index:
+; CHECK: ldff1h { z0.s }, p0/z, [x0, z0.s, sxtw #1]
+; CHECK-NEXT: ret
+ %load = call <vscale x 4 x i16> @llvm.aarch64.sve.ldff1.gather.sxtw.index.nxv4i16(<vscale x 4 x i1> %pg,
+ i16* %base,
+ <vscale x 4 x i32> %b)
+ %res = zext <vscale x 4 x i16> %load to <vscale x 4 x i32>
+ ret <vscale x 4 x i32> %res
+}
+
+define <vscale x 2 x i64> @gldff1h_d_uxtw_index(<vscale x 2 x i1> %pg, i16* %base, <vscale x 2 x i32> %b) {
+; CHECK-LABEL: gldff1h_d_uxtw_index:
+; CHECK: ldff1h { z0.d }, p0/z, [x0, z0.d, uxtw #1]
+; CHECK-NEXT: ret
+ %load = call <vscale x 2 x i16> @llvm.aarch64.sve.ldff1.gather.uxtw.index.nxv2i16(<vscale x 2 x i1> %pg,
+ i16* %base,
+ <vscale x 2 x i32> %b)
+ %res = zext <vscale x 2 x i16> %load to <vscale x 2 x i64>
+ ret <vscale x 2 x i64> %res
+}
+
+define <vscale x 2 x i64> @gldff1h_d_sxtw_index(<vscale x 2 x i1> %pg, i16* %base, <vscale x 2 x i32> %b) {
+; CHECK-LABEL: gldff1h_d_sxtw_index:
+; CHECK: ldff1h { z0.d }, p0/z, [x0, z0.d, sxtw #1]
+; CHECK-NEXT: ret
+ %load = call <vscale x 2 x i16> @llvm.aarch64.sve.ldff1.gather.sxtw.index.nxv2i16(<vscale x 2 x i1> %pg,
+ i16* %base,
+ <vscale x 2 x i32> %b)
+ %res = zext <vscale x 2 x i16> %load to <vscale x 2 x i64>
+ ret <vscale x 2 x i64> %res
+}
+
+; LDFF1W
+define <vscale x 4 x i32> @gldff1w_s_uxtw_index(<vscale x 4 x i1> %pg, i32* %base, <vscale x 4 x i32> %b) {
+; CHECK-LABEL: gldff1w_s_uxtw_index:
+; CHECK: ldff1w { z0.s }, p0/z, [x0, z0.s, uxtw #2]
+; CHECK-NEXT: ret
+ %load = call <vscale x 4 x i32> @llvm.aarch64.sve.ldff1.gather.uxtw.index.nxv4i32(<vscale x 4 x i1> %pg,
+ i32* %base,
+ <vscale x 4 x i32> %b)
+ ret <vscale x 4 x i32> %load
+}
+
+define <vscale x 4 x i32> @gldff1w_s_sxtw_index(<vscale x 4 x i1> %pg, i32* %base, <vscale x 4 x i32> %b) {
+; CHECK-LABEL: gldff1w_s_sxtw_index:
+; CHECK: ldff1w { z0.s }, p0/z, [x0, z0.s, sxtw #2]
+; CHECK-NEXT: ret
+ %load = call <vscale x 4 x i32> @llvm.aarch64.sve.ldff1.gather.sxtw.index.nxv4i32(<vscale x 4 x i1> %pg,
+ i32* %base,
+ <vscale x 4 x i32> %b)
+ ret <vscale x 4 x i32> %load
+}
+
+define <vscale x 2 x i64> @gldff1w_d_uxtw_index(<vscale x 2 x i1> %pg, i32* %base, <vscale x 2 x i32> %b) {
+; CHECK-LABEL: gldff1w_d_uxtw_index:
+; CHECK: ldff1w { z0.d }, p0/z, [x0, z0.d, uxtw #2]
+; CHECK-NEXT: ret
+ %load = call <vscale x 2 x i32> @llvm.aarch64.sve.ldff1.gather.uxtw.index.nxv2i32(<vscale x 2 x i1> %pg,
+ i32* %base,
+ <vscale x 2 x i32> %b)
+ %res = zext <vscale x 2 x i32> %load to <vscale x 2 x i64>
+ ret <vscale x 2 x i64> %res
+}
+
+define <vscale x 2 x i64> @gldff1w_d_sxtw_index(<vscale x 2 x i1> %pg, i32* %base, <vscale x 2 x i32> %b) {
+; CHECK-LABEL: gldff1w_d_sxtw_index:
+; CHECK: ldff1w { z0.d }, p0/z, [x0, z0.d, sxtw #2]
+; CHECK-NEXT: ret
+ %load = call <vscale x 2 x i32> @llvm.aarch64.sve.ldff1.gather.sxtw.index.nxv2i32(<vscale x 2 x i1> %pg,
+ i32* %base,
+ <vscale x 2 x i32> %b)
+ %res = zext <vscale x 2 x i32> %load to <vscale x 2 x i64>
+ ret <vscale x 2 x i64> %res
+}
+
+define <vscale x 4 x float> @gldff1w_s_uxtw_index_float(<vscale x 4 x i1> %pg, float* %base, <vscale x 4 x i32> %b) {
+; CHECK-LABEL: gldff1w_s_uxtw_index_float:
+; CHECK: ldff1w { z0.s }, p0/z, [x0, z0.s, uxtw #2]
+; CHECK-NEXT: ret
+ %load = call <vscale x 4 x float> @llvm.aarch64.sve.ldff1.gather.uxtw.index.nxv4f32(<vscale x 4 x i1> %pg,
+ float* %base,
+ <vscale x 4 x i32> %b)
+ ret <vscale x 4 x float> %load
+}
+
+define <vscale x 4 x float> @gldff1w_s_sxtw_index_float(<vscale x 4 x i1> %pg, float* %base, <vscale x 4 x i32> %b) {
+; CHECK-LABEL: gldff1w_s_sxtw_index_float:
+; CHECK: ldff1w { z0.s }, p0/z, [x0, z0.s, sxtw #2]
+; CHECK-NEXT: ret
+ %load = call <vscale x 4 x float> @llvm.aarch64.sve.ldff1.gather.sxtw.index.nxv4f32(<vscale x 4 x i1> %pg,
+ float* %base,
+ <vscale x 4 x i32> %b)
+ ret <vscale x 4 x float> %load
+}
+
+; LDFF1D
+define <vscale x 2 x i64> @gldff1d_s_uxtw_index(<vscale x 2 x i1> %pg, i64* %base, <vscale x 2 x i32> %b) {
+; CHECK-LABEL: gldff1d_s_uxtw_index:
+; CHECK: ldff1d { z0.d }, p0/z, [x0, z0.d, uxtw #3]
+; CHECK-NEXT: ret
+ %load = call <vscale x 2 x i64> @llvm.aarch64.sve.ldff1.gather.uxtw.index.nxv2i64(<vscale x 2 x i1> %pg,
+ i64* %base,
+ <vscale x 2 x i32> %b)
+ ret <vscale x 2 x i64> %load
+}
+
+define <vscale x 2 x i64> @gldff1d_sxtw_index(<vscale x 2 x i1> %pg, i64* %base, <vscale x 2 x i32> %b) {
+; CHECK-LABEL: gldff1d_sxtw_index:
+; CHECK: ldff1d { z0.d }, p0/z, [x0, z0.d, sxtw #3]
+; CHECK-NEXT: ret
+ %load = call <vscale x 2 x i64> @llvm.aarch64.sve.ldff1.gather.sxtw.index.nxv2i64(<vscale x 2 x i1> %pg,
+ i64* %base,
+ <vscale x 2 x i32> %b)
+ ret <vscale x 2 x i64> %load
+}
+
+define <vscale x 2 x double> @gldff1d_uxtw_index_double(<vscale x 2 x i1> %pg, double* %base, <vscale x 2 x i32> %b) {
+; CHECK-LABEL: gldff1d_uxtw_index_double:
+; CHECK: ldff1d { z0.d }, p0/z, [x0, z0.d, uxtw #3]
+; CHECK-NEXT: ret
+ %load = call <vscale x 2 x double> @llvm.aarch64.sve.ldff1.gather.uxtw.index.nxv2f64(<vscale x 2 x i1> %pg,
+ double* %base,
+ <vscale x 2 x i32> %b)
+ ret <vscale x 2 x double> %load
+}
+
+define <vscale x 2 x double> @gldff1d_sxtw_index_double(<vscale x 2 x i1> %pg, double* %base, <vscale x 2 x i32> %b) {
+; CHECK-LABEL: gldff1d_sxtw_index_double:
+; CHECK: ldff1d { z0.d }, p0/z, [x0, z0.d, sxtw #3]
+; CHECK-NEXT: ret
+ %load = call <vscale x 2 x double> @llvm.aarch64.sve.ldff1.gather.sxtw.index.nxv2f64(<vscale x 2 x i1> %pg,
+ double* %base,
+ <vscale x 2 x i32> %b)
+ ret <vscale x 2 x double> %load
+}
+
+;
+; LDFF1SH, LDFF1SW, LDFF1SD: base + 32-bit scaled offset, sign (sxtw) or zero (uxtw)
+; extended to 64 bits
+; e.g. ldff1sh z0.d, p0/z, [x0, z0.d, uxtw #1]
+;
+
+; LDFF1SH
+define <vscale x 4 x i32> @gldff1sh_s_uxtw_index(<vscale x 4 x i1> %pg, i16* %base, <vscale x 4 x i32> %b) {
+; CHECK-LABEL: gldff1sh_s_uxtw_index:
+; CHECK: ldff1sh { z0.s }, p0/z, [x0, z0.s, uxtw #1]
+; CHECK-NEXT: ret
+ %load = call <vscale x 4 x i16> @llvm.aarch64.sve.ldff1.gather.uxtw.index.nxv4i16(<vscale x 4 x i1> %pg,
+ i16* %base,
+ <vscale x 4 x i32> %b)
+ %res = sext <vscale x 4 x i16> %load to <vscale x 4 x i32>
+ ret <vscale x 4 x i32> %res
+}
+
+define <vscale x 4 x i32> @gldff1sh_s_sxtw_index(<vscale x 4 x i1> %pg, i16* %base, <vscale x 4 x i32> %b) {
+; CHECK-LABEL: gldff1sh_s_sxtw_index:
+; CHECK: ldff1sh { z0.s }, p0/z, [x0, z0.s, sxtw #1]
+; CHECK-NEXT: ret
+ %load = call <vscale x 4 x i16> @llvm.aarch64.sve.ldff1.gather.sxtw.index.nxv4i16(<vscale x 4 x i1> %pg,
+ i16* %base,
+ <vscale x 4 x i32> %b)
+ %res = sext <vscale x 4 x i16> %load to <vscale x 4 x i32>
+ ret <vscale x 4 x i32> %res
+}
+
+define <vscale x 2 x i64> @gldff1sh_d_uxtw_index(<vscale x 2 x i1> %pg, i16* %base, <vscale x 2 x i32> %b) {
+; CHECK-LABEL: gldff1sh_d_uxtw_index:
+; CHECK: ldff1sh { z0.d }, p0/z, [x0, z0.d, uxtw #1]
+; CHECK-NEXT: ret
+ %load = call <vscale x 2 x i16> @llvm.aarch64.sve.ldff1.gather.uxtw.index.nxv2i16(<vscale x 2 x i1> %pg,
+ i16* %base,
+ <vscale x 2 x i32> %b)
+ %res = sext <vscale x 2 x i16> %load to <vscale x 2 x i64>
+ ret <vscale x 2 x i64> %res
+}
+
+define <vscale x 2 x i64> @gldff1sh_d_sxtw_index(<vscale x 2 x i1> %pg, i16* %base, <vscale x 2 x i32> %b) {
+; CHECK-LABEL: gldff1sh_d_sxtw_index:
+; CHECK: ldff1sh { z0.d }, p0/z, [x0, z0.d, sxtw #1]
+; CHECK-NEXT: ret
+ %load = call <vscale x 2 x i16> @llvm.aarch64.sve.ldff1.gather.sxtw.index.nxv2i16(<vscale x 2 x i1> %pg,
+ i16* %base,
+ <vscale x 2 x i32> %b)
+ %res = sext <vscale x 2 x i16> %load to <vscale x 2 x i64>
+ ret <vscale x 2 x i64> %res
+}
+
+; LDFF1SW
+define <vscale x 2 x i64> @gldff1sw_d_uxtw_index(<vscale x 2 x i1> %pg, i32* %base, <vscale x 2 x i32> %b) {
+; CHECK-LABEL: gldff1sw_d_uxtw_index:
+; CHECK: ldff1sw { z0.d }, p0/z, [x0, z0.d, uxtw #2]
+; CHECK-NEXT: ret
+ %load = call <vscale x 2 x i32> @llvm.aarch64.sve.ldff1.gather.uxtw.index.nxv2i32(<vscale x 2 x i1> %pg,
+ i32* %base,
+ <vscale x 2 x i32> %b)
+ %res = sext <vscale x 2 x i32> %load to <vscale x 2 x i64>
+ ret <vscale x 2 x i64> %res
+}
+
+define <vscale x 2 x i64> @gldff1sw_d_sxtw_index(<vscale x 2 x i1> %pg, i32* %base, <vscale x 2 x i32> %b) {
+; CHECK-LABEL: gldff1sw_d_sxtw_index:
+; CHECK: ldff1sw { z0.d }, p0/z, [x0, z0.d, sxtw #2]
+; CHECK-NEXT: ret
+ %load = call <vscale x 2 x i32> @llvm.aarch64.sve.ldff1.gather.sxtw.index.nxv2i32(<vscale x 2 x i1> %pg,
+ i32* %base,
+ <vscale x 2 x i32> %b)
+ %res = sext <vscale x 2 x i32> %load to <vscale x 2 x i64>
+ ret <vscale x 2 x i64> %res
+}
+
+
+; LDFF1H/LDFF1SH
+declare <vscale x 4 x i16> @llvm.aarch64.sve.ldff1.gather.uxtw.index.nxv4i16(<vscale x 4 x i1>, i16*, <vscale x 4 x i32>)
+declare <vscale x 4 x i16> @llvm.aarch64.sve.ldff1.gather.sxtw.index.nxv4i16(<vscale x 4 x i1>, i16*, <vscale x 4 x i32>)
+
+declare <vscale x 2 x i16> @llvm.aarch64.sve.ldff1.gather.uxtw.index.nxv2i16(<vscale x 2 x i1>, i16*, <vscale x 2 x i32>)
+declare <vscale x 2 x i16> @llvm.aarch64.sve.ldff1.gather.sxtw.index.nxv2i16(<vscale x 2 x i1>, i16*, <vscale x 2 x i32>)
+
+; LDFF1W/LDFF1SW
+declare <vscale x 4 x i32> @llvm.aarch64.sve.ldff1.gather.uxtw.index.nxv4i32(<vscale x 4 x i1>, i32*, <vscale x 4 x i32>)
+declare <vscale x 4 x i32> @llvm.aarch64.sve.ldff1.gather.sxtw.index.nxv4i32(<vscale x 4 x i1>, i32*, <vscale x 4 x i32>)
+
+declare <vscale x 2 x i32> @llvm.aarch64.sve.ldff1.gather.uxtw.index.nxv2i32(<vscale x 2 x i1>, i32*, <vscale x 2 x i32>)
+declare <vscale x 2 x i32> @llvm.aarch64.sve.ldff1.gather.sxtw.index.nxv2i32(<vscale x 2 x i1>, i32*, <vscale x 2 x i32>)
+
+declare <vscale x 4 x float> @llvm.aarch64.sve.ldff1.gather.uxtw.index.nxv4f32(<vscale x 4 x i1>, float*, <vscale x 4 x i32>)
+declare <vscale x 4 x float> @llvm.aarch64.sve.ldff1.gather.sxtw.index.nxv4f32(<vscale x 4 x i1>, float*, <vscale x 4 x i32>)
+
+; LDFF1D
+declare <vscale x 2 x i64> @llvm.aarch64.sve.ldff1.gather.uxtw.index.nxv2i64(<vscale x 2 x i1>, i64*, <vscale x 2 x i32>)
+declare <vscale x 2 x i64> @llvm.aarch64.sve.ldff1.gather.sxtw.index.nxv2i64(<vscale x 2 x i1>, i64*, <vscale x 2 x i32>)
+
+declare <vscale x 2 x double> @llvm.aarch64.sve.ldff1.gather.uxtw.index.nxv2f64(<vscale x 2 x i1>, double*, <vscale x 2 x i32>)
+declare <vscale x 2 x double> @llvm.aarch64.sve.ldff1.gather.sxtw.index.nxv2f64(<vscale x 2 x i1>, double*, <vscale x 2 x i32>)
diff --git a/llvm/test/CodeGen/AArch64/sve-intrinsics-ff-gather-loads-32bit-unscaled-offsets.ll b/llvm/test/CodeGen/AArch64/sve-intrinsics-ff-gather-loads-32bit-unscaled-offsets.ll
new file mode 100644
index 000000000000..012812fb22b0
--- /dev/null
+++ b/llvm/test/CodeGen/AArch64/sve-intrinsics-ff-gather-loads-32bit-unscaled-offsets.ll
@@ -0,0 +1,348 @@
+; RUN: llc -mtriple=aarch64-linux-gnu -mattr=+sve < %s | FileCheck %s
+
+;
+; LDFF1B, LDFF1W, LDFF1H, LDFF1D: base + 32-bit unscaled offset, sign (sxtw) or zero
+; (uxtw) extended to 64 bits.
+; e.g. ldff1h { z0.d }, p0/z, [x0, z0.d, uxtw]
+;
+
+; LDFF1B
+define <vscale x 4 x i32> @gldff1b_s_uxtw(<vscale x 4 x i1> %pg, i8* %base, <vscale x 4 x i32> %b) {
+; CHECK-LABEL: gldff1b_s_uxtw:
+; CHECK: ldff1b { z0.s }, p0/z, [x0, z0.s, uxtw]
+; CHECK-NEXT: ret
+ %load = call <vscale x 4 x i8> @llvm.aarch64.sve.ldff1.gather.uxtw.nxv4i8(<vscale x 4 x i1> %pg,
+ i8* %base,
+ <vscale x 4 x i32> %b)
+ %res = zext <vscale x 4 x i8> %load to <vscale x 4 x i32>
+ ret <vscale x 4 x i32> %res
+}
+
+define <vscale x 4 x i32> @gldff1b_s_sxtw(<vscale x 4 x i1> %pg, i8* %base, <vscale x 4 x i32> %b) {
+; CHECK-LABEL: gldff1b_s_sxtw:
+; CHECK: ldff1b { z0.s }, p0/z, [x0, z0.s, sxtw]
+; CHECK-NEXT: ret
+ %load = call <vscale x 4 x i8> @llvm.aarch64.sve.ldff1.gather.sxtw.nxv4i8(<vscale x 4 x i1> %pg,
+ i8* %base,
+ <vscale x 4 x i32> %b)
+ %res = zext <vscale x 4 x i8> %load to <vscale x 4 x i32>
+ ret <vscale x 4 x i32> %res
+}
+
+define <vscale x 2 x i64> @gldff1b_d_uxtw(<vscale x 2 x i1> %pg, i8* %base, <vscale x 2 x i32> %b) {
+; CHECK-LABEL: gldff1b_d_uxtw:
+; CHECK: ldff1b { z0.d }, p0/z, [x0, z0.d, uxtw]
+; CHECK-NEXT: ret
+ %load = call <vscale x 2 x i8> @llvm.aarch64.sve.ldff1.gather.uxtw.nxv2i8(<vscale x 2 x i1> %pg,
+ i8* %base,
+ <vscale x 2 x i32> %b)
+ %res = zext <vscale x 2 x i8> %load to <vscale x 2 x i64>
+ ret <vscale x 2 x i64> %res
+}
+
+define <vscale x 2 x i64> @gldff1b_d_sxtw(<vscale x 2 x i1> %pg, i8* %base, <vscale x 2 x i32> %b) {
+; CHECK-LABEL: gldff1b_d_sxtw:
+; CHECK: ldff1b { z0.d }, p0/z, [x0, z0.d, sxtw]
+; CHECK-NEXT: ret
+ %load = call <vscale x 2 x i8> @llvm.aarch64.sve.ldff1.gather.sxtw.nxv2i8(<vscale x 2 x i1> %pg,
+ i8* %base,
+ <vscale x 2 x i32> %b)
+ %res = zext <vscale x 2 x i8> %load to <vscale x 2 x i64>
+ ret <vscale x 2 x i64> %res
+}
+
+; LDFF1H
+define <vscale x 4 x i32> @gldff1h_s_uxtw(<vscale x 4 x i1> %pg, i16* %base, <vscale x 4 x i32> %b) {
+; CHECK-LABEL: gldff1h_s_uxtw:
+; CHECK: ldff1h { z0.s }, p0/z, [x0, z0.s, uxtw]
+; CHECK-NEXT: ret
+ %load = call <vscale x 4 x i16> @llvm.aarch64.sve.ldff1.gather.uxtw.nxv4i16(<vscale x 4 x i1> %pg,
+ i16* %base,
+ <vscale x 4 x i32> %b)
+ %res = zext <vscale x 4 x i16> %load to <vscale x 4 x i32>
+ ret <vscale x 4 x i32> %res
+}
+
+define <vscale x 4 x i32> @gldff1h_s_sxtw(<vscale x 4 x i1> %pg, i16* %base, <vscale x 4 x i32> %b) {
+; CHECK-LABEL: gldff1h_s_sxtw:
+; CHECK: ldff1h { z0.s }, p0/z, [x0, z0.s, sxtw]
+; CHECK-NEXT: ret
+ %load = call <vscale x 4 x i16> @llvm.aarch64.sve.ldff1.gather.sxtw.nxv4i16(<vscale x 4 x i1> %pg,
+ i16* %base,
+ <vscale x 4 x i32> %b)
+ %res = zext <vscale x 4 x i16> %load to <vscale x 4 x i32>
+ ret <vscale x 4 x i32> %res
+}
+
+define <vscale x 2 x i64> @gldff1h_d_uxtw(<vscale x 2 x i1> %pg, i16* %base, <vscale x 2 x i32> %b) {
+; CHECK-LABEL: gldff1h_d_uxtw:
+; CHECK: ldff1h { z0.d }, p0/z, [x0, z0.d, uxtw]
+; CHECK-NEXT: ret
+ %load = call <vscale x 2 x i16> @llvm.aarch64.sve.ldff1.gather.uxtw.nxv2i16(<vscale x 2 x i1> %pg,
+ i16* %base,
+ <vscale x 2 x i32> %b)
+ %res = zext <vscale x 2 x i16> %load to <vscale x 2 x i64>
+ ret <vscale x 2 x i64> %res
+}
+
+define <vscale x 2 x i64> @gldff1h_d_sxtw(<vscale x 2 x i1> %pg, i16* %base, <vscale x 2 x i32> %b) {
+; CHECK-LABEL: gldff1h_d_sxtw:
+; CHECK: ldff1h { z0.d }, p0/z, [x0, z0.d, sxtw]
+; CHECK-NEXT: ret
+ %load = call <vscale x 2 x i16> @llvm.aarch64.sve.ldff1.gather.sxtw.nxv2i16(<vscale x 2 x i1> %pg,
+ i16* %base,
+ <vscale x 2 x i32> %b)
+ %res = zext <vscale x 2 x i16> %load to <vscale x 2 x i64>
+ ret <vscale x 2 x i64> %res
+}
+
+; LDFF1W
+define <vscale x 4 x i32> @gldff1w_s_uxtw(<vscale x 4 x i1> %pg, i32* %base, <vscale x 4 x i32> %b) {
+; CHECK-LABEL: gldff1w_s_uxtw:
+; CHECK: ldff1w { z0.s }, p0/z, [x0, z0.s, uxtw]
+; CHECK-NEXT: ret
+ %load = call <vscale x 4 x i32> @llvm.aarch64.sve.ldff1.gather.uxtw.nxv4i32(<vscale x 4 x i1> %pg,
+ i32* %base,
+ <vscale x 4 x i32> %b)
+ ret <vscale x 4 x i32> %load
+}
+
+define <vscale x 4 x i32> @gldff1w_s_sxtw(<vscale x 4 x i1> %pg, i32* %base, <vscale x 4 x i32> %b) {
+; CHECK-LABEL: gldff1w_s_sxtw:
+; CHECK: ldff1w { z0.s }, p0/z, [x0, z0.s, sxtw]
+; CHECK-NEXT: ret
+ %load = call <vscale x 4 x i32> @llvm.aarch64.sve.ldff1.gather.sxtw.nxv4i32(<vscale x 4 x i1> %pg,
+ i32* %base,
+ <vscale x 4 x i32> %b)
+ ret <vscale x 4 x i32> %load
+}
+
+define <vscale x 2 x i64> @gldff1w_d_uxtw(<vscale x 2 x i1> %pg, i32* %base, <vscale x 2 x i32> %b) {
+; CHECK-LABEL: gldff1w_d_uxtw:
+; CHECK: ldff1w { z0.d }, p0/z, [x0, z0.d, uxtw]
+; CHECK-NEXT: ret
+ %load = call <vscale x 2 x i32> @llvm.aarch64.sve.ldff1.gather.uxtw.nxv2i32(<vscale x 2 x i1> %pg,
+ i32* %base,
+ <vscale x 2 x i32> %b)
+ %res = zext <vscale x 2 x i32> %load to <vscale x 2 x i64>
+ ret <vscale x 2 x i64> %res
+}
+
+define <vscale x 2 x i64> @gldff1w_d_sxtw(<vscale x 2 x i1> %pg, i32* %base, <vscale x 2 x i32> %b) {
+; CHECK-LABEL: gldff1w_d_sxtw:
+; CHECK: ldff1w { z0.d }, p0/z, [x0, z0.d, sxtw]
+; CHECK-NEXT: ret
+ %load = call <vscale x 2 x i32> @llvm.aarch64.sve.ldff1.gather.sxtw.nxv2i32(<vscale x 2 x i1> %pg,
+ i32* %base,
+ <vscale x 2 x i32> %b)
+ %res = zext <vscale x 2 x i32> %load to <vscale x 2 x i64>
+ ret <vscale x 2 x i64> %res
+}
+
+define <vscale x 4 x float> @gldff1w_s_uxtw_float(<vscale x 4 x i1> %pg, float* %base, <vscale x 4 x i32> %b) {
+; CHECK-LABEL: gldff1w_s_uxtw_float:
+; CHECK: ldff1w { z0.s }, p0/z, [x0, z0.s, uxtw]
+; CHECK-NEXT: ret
+ %load = call <vscale x 4 x float> @llvm.aarch64.sve.ldff1.gather.uxtw.nxv4f32(<vscale x 4 x i1> %pg,
+ float* %base,
+ <vscale x 4 x i32> %b)
+ ret <vscale x 4 x float> %load
+}
+
+define <vscale x 4 x float> @gldff1w_s_sxtw_float(<vscale x 4 x i1> %pg, float* %base, <vscale x 4 x i32> %b) {
+; CHECK-LABEL: gldff1w_s_sxtw_float:
+; CHECK: ldff1w { z0.s }, p0/z, [x0, z0.s, sxtw]
+; CHECK-NEXT: ret
+ %load = call <vscale x 4 x float> @llvm.aarch64.sve.ldff1.gather.sxtw.nxv4f32(<vscale x 4 x i1> %pg,
+ float* %base,
+ <vscale x 4 x i32> %b)
+ ret <vscale x 4 x float> %load
+}
+
+; LDFF1D
+define <vscale x 2 x i64> @gldff1d_d_uxtw(<vscale x 2 x i1> %pg, i64* %base, <vscale x 2 x i32> %b) {
+; CHECK-LABEL: gldff1d_d_uxtw:
+; CHECK: ldff1d { z0.d }, p0/z, [x0, z0.d, uxtw]
+; CHECK-NEXT: ret
+ %load = call <vscale x 2 x i64> @llvm.aarch64.sve.ldff1.gather.uxtw.nxv2i64(<vscale x 2 x i1> %pg,
+ i64* %base,
+ <vscale x 2 x i32> %b)
+ ret <vscale x 2 x i64> %load
+}
+
+define <vscale x 2 x i64> @gldff1d_d_sxtw(<vscale x 2 x i1> %pg, i64* %base, <vscale x 2 x i32> %b) {
+; CHECK-LABEL: gldff1d_d_sxtw:
+; CHECK: ldff1d { z0.d }, p0/z, [x0, z0.d, sxtw]
+; CHECK-NEXT: ret
+ %load = call <vscale x 2 x i64> @llvm.aarch64.sve.ldff1.gather.sxtw.nxv2i64(<vscale x 2 x i1> %pg,
+ i64* %base,
+ <vscale x 2 x i32> %b)
+ ret <vscale x 2 x i64> %load
+}
+
+define <vscale x 2 x double> @gldff1d_d_uxtw_double(<vscale x 2 x i1> %pg, double* %base, <vscale x 2 x i32> %b) {
+; CHECK-LABEL: gldff1d_d_uxtw_double:
+; CHECK: ldff1d { z0.d }, p0/z, [x0, z0.d, uxtw]
+; CHECK-NEXT: ret
+ %load = call <vscale x 2 x double> @llvm.aarch64.sve.ldff1.gather.uxtw.nxv2f64(<vscale x 2 x i1> %pg,
+ double* %base,
+ <vscale x 2 x i32> %b)
+ ret <vscale x 2 x double> %load
+}
+
+define <vscale x 2 x double> @gldff1d_d_sxtw_double(<vscale x 2 x i1> %pg, double* %base, <vscale x 2 x i32> %b) {
+; CHECK-LABEL: gldff1d_d_sxtw_double:
+; CHECK: ldff1d { z0.d }, p0/z, [x0, z0.d, sxtw]
+; CHECK-NEXT: ret
+ %load = call <vscale x 2 x double> @llvm.aarch64.sve.ldff1.gather.sxtw.nxv2f64(<vscale x 2 x i1> %pg,
+ double* %base,
+ <vscale x 2 x i32> %b)
+ ret <vscale x 2 x double> %load
+}
+
+;
+; LDFF1SB, LDFF1SW, LDFF1SH: base + 32-bit unscaled offset, sign (sxtw) or zero
+; (uxtw) extended to 64 bits.
+; e.g. ldff1sh { z0.d }, p0/z, [x0, z0.d, uxtw]
+;
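+; Note: these use the same ldff1.gather.* intrinsics as the zero-extending tests
+; above; it is the sext (rather than zext) of the narrow loaded elements that
+; makes instruction selection pick the sign-extending LDFF1S* form.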
+
+; LDFF1SB
+define <vscale x 4 x i32> @gldff1sb_s_uxtw(<vscale x 4 x i1> %pg, i8* %base, <vscale x 4 x i32> %b) {
+; CHECK-LABEL: gldff1sb_s_uxtw:
+; CHECK: ldff1sb { z0.s }, p0/z, [x0, z0.s, uxtw]
+; CHECK-NEXT: ret
+ %load = call <vscale x 4 x i8> @llvm.aarch64.sve.ldff1.gather.uxtw.nxv4i8(<vscale x 4 x i1> %pg,
+ i8* %base,
+ <vscale x 4 x i32> %b)
+ %res = sext <vscale x 4 x i8> %load to <vscale x 4 x i32>
+ ret <vscale x 4 x i32> %res
+}
+
+define <vscale x 4 x i32> @gldff1sb_s_sxtw(<vscale x 4 x i1> %pg, i8* %base, <vscale x 4 x i32> %b) {
+; CHECK-LABEL: gldff1sb_s_sxtw:
+; CHECK: ldff1sb { z0.s }, p0/z, [x0, z0.s, sxtw]
+; CHECK-NEXT: ret
+ %load = call <vscale x 4 x i8> @llvm.aarch64.sve.ldff1.gather.sxtw.nxv4i8(<vscale x 4 x i1> %pg,
+ i8* %base,
+ <vscale x 4 x i32> %b)
+ %res = sext <vscale x 4 x i8> %load to <vscale x 4 x i32>
+ ret <vscale x 4 x i32> %res
+}
+
+define <vscale x 2 x i64> @gldff1sb_d_uxtw(<vscale x 2 x i1> %pg, i8* %base, <vscale x 2 x i32> %b) {
+; CHECK-LABEL: gldff1sb_d_uxtw:
+; CHECK: ldff1sb { z0.d }, p0/z, [x0, z0.d, uxtw]
+; CHECK-NEXT: ret
+ %load = call <vscale x 2 x i8> @llvm.aarch64.sve.ldff1.gather.uxtw.nxv2i8(<vscale x 2 x i1> %pg,
+ i8* %base,
+ <vscale x 2 x i32> %b)
+ %res = sext <vscale x 2 x i8> %load to <vscale x 2 x i64>
+ ret <vscale x 2 x i64> %res
+}
+
+define <vscale x 2 x i64> @gldff1sb_d_sxtw(<vscale x 2 x i1> %pg, i8* %base, <vscale x 2 x i32> %b) {
+; CHECK-LABEL: gldff1sb_d_sxtw:
+; CHECK: ldff1sb { z0.d }, p0/z, [x0, z0.d, sxtw]
+; CHECK-NEXT: ret
+ %load = call <vscale x 2 x i8> @llvm.aarch64.sve.ldff1.gather.sxtw.nxv2i8(<vscale x 2 x i1> %pg,
+ i8* %base,
+ <vscale x 2 x i32> %b)
+ %res = sext <vscale x 2 x i8> %load to <vscale x 2 x i64>
+ ret <vscale x 2 x i64> %res
+}
+
+; LDFF1SH
+define <vscale x 4 x i32> @gldff1sh_s_uxtw(<vscale x 4 x i1> %pg, i16* %base, <vscale x 4 x i32> %b) {
+; CHECK-LABEL: gldff1sh_s_uxtw:
+; CHECK: ldff1sh { z0.s }, p0/z, [x0, z0.s, uxtw]
+; CHECK-NEXT: ret
+ %load = call <vscale x 4 x i16> @llvm.aarch64.sve.ldff1.gather.uxtw.nxv4i16(<vscale x 4 x i1> %pg,
+ i16* %base,
+ <vscale x 4 x i32> %b)
+ %res = sext <vscale x 4 x i16> %load to <vscale x 4 x i32>
+ ret <vscale x 4 x i32> %res
+}
+
+define <vscale x 4 x i32> @gldff1sh_s_sxtw(<vscale x 4 x i1> %pg, i16* %base, <vscale x 4 x i32> %b) {
+; CHECK-LABEL: gldff1sh_s_sxtw:
+; CHECK: ldff1sh { z0.s }, p0/z, [x0, z0.s, sxtw]
+; CHECK-NEXT: ret
+ %load = call <vscale x 4 x i16> @llvm.aarch64.sve.ldff1.gather.sxtw.nxv4i16(<vscale x 4 x i1> %pg,
+ i16* %base,
+ <vscale x 4 x i32> %b)
+ %res = sext <vscale x 4 x i16> %load to <vscale x 4 x i32>
+ ret <vscale x 4 x i32> %res
+}
+
+define <vscale x 2 x i64> @gldff1sh_d_uxtw(<vscale x 2 x i1> %pg, i16* %base, <vscale x 2 x i32> %b) {
+; CHECK-LABEL: gldff1sh_d_uxtw:
+; CHECK: ldff1sh { z0.d }, p0/z, [x0, z0.d, uxtw]
+; CHECK-NEXT: ret
+ %load = call <vscale x 2 x i16> @llvm.aarch64.sve.ldff1.gather.uxtw.nxv2i16(<vscale x 2 x i1> %pg,
+ i16* %base,
+ <vscale x 2 x i32> %b)
+ %res = sext <vscale x 2 x i16> %load to <vscale x 2 x i64>
+ ret <vscale x 2 x i64> %res
+}
+
+define <vscale x 2 x i64> @gldff1sh_d_sxtw(<vscale x 2 x i1> %pg, i16* %base, <vscale x 2 x i32> %b) {
+; CHECK-LABEL: gldff1sh_d_sxtw:
+; CHECK: ldff1sh { z0.d }, p0/z, [x0, z0.d, sxtw]
+; CHECK-NEXT: ret
+ %load = call <vscale x 2 x i16> @llvm.aarch64.sve.ldff1.gather.sxtw.nxv2i16(<vscale x 2 x i1> %pg,
+ i16* %base,
+ <vscale x 2 x i32> %b)
+ %res = sext <vscale x 2 x i16> %load to <vscale x 2 x i64>
+ ret <vscale x 2 x i64> %res
+}
+
+; LDFF1SW
+define <vscale x 2 x i64> @gldff1sw_d_uxtw(<vscale x 2 x i1> %pg, i32* %base, <vscale x 2 x i32> %b) {
+; CHECK-LABEL: gldff1sw_d_uxtw:
+; CHECK: ldff1sw { z0.d }, p0/z, [x0, z0.d, uxtw]
+; CHECK-NEXT: ret
+ %load = call <vscale x 2 x i32> @llvm.aarch64.sve.ldff1.gather.uxtw.nxv2i32(<vscale x 2 x i1> %pg,
+ i32* %base,
+ <vscale x 2 x i32> %b)
+ %res = sext <vscale x 2 x i32> %load to <vscale x 2 x i64>
+ ret <vscale x 2 x i64> %res
+}
+
+define <vscale x 2 x i64> @gldff1sw_d_sxtw(<vscale x 2 x i1> %pg, i32* %base, <vscale x 2 x i32> %b) {
+; CHECK-LABEL: gldff1sw_d_sxtw:
+; CHECK: ldff1sw { z0.d }, p0/z, [x0, z0.d, sxtw]
+; CHECK-NEXT: ret
+ %load = call <vscale x 2 x i32> @llvm.aarch64.sve.ldff1.gather.sxtw.nxv2i32(<vscale x 2 x i1> %pg,
+ i32* %base,
+ <vscale x 2 x i32> %b)
+ %res = sext <vscale x 2 x i32> %load to <vscale x 2 x i64>
+ ret <vscale x 2 x i64> %res
+}
+
+; LDFF1B/LDFF1SB
+declare <vscale x 4 x i8> @llvm.aarch64.sve.ldff1.gather.uxtw.nxv4i8(<vscale x 4 x i1>, i8*, <vscale x 4 x i32>)
+declare <vscale x 2 x i8> @llvm.aarch64.sve.ldff1.gather.uxtw.nxv2i8(<vscale x 2 x i1>, i8*, <vscale x 2 x i32>)
+declare <vscale x 4 x i8> @llvm.aarch64.sve.ldff1.gather.sxtw.nxv4i8(<vscale x 4 x i1>, i8*, <vscale x 4 x i32>)
+declare <vscale x 2 x i8> @llvm.aarch64.sve.ldff1.gather.sxtw.nxv2i8(<vscale x 2 x i1>, i8*, <vscale x 2 x i32>)
+
+; LDFF1H/LDFF1SH
+declare <vscale x 4 x i16> @llvm.aarch64.sve.ldff1.gather.sxtw.nxv4i16(<vscale x 4 x i1>, i16*, <vscale x 4 x i32>)
+declare <vscale x 2 x i16> @llvm.aarch64.sve.ldff1.gather.sxtw.nxv2i16(<vscale x 2 x i1>, i16*, <vscale x 2 x i32>)
+declare <vscale x 4 x i16> @llvm.aarch64.sve.ldff1.gather.uxtw.nxv4i16(<vscale x 4 x i1>, i16*, <vscale x 4 x i32>)
+declare <vscale x 2 x i16> @llvm.aarch64.sve.ldff1.gather.uxtw.nxv2i16(<vscale x 2 x i1>, i16*, <vscale x 2 x i32>)
+
+; LDFF1W/LDFF1SW
+declare <vscale x 4 x i32> @llvm.aarch64.sve.ldff1.gather.sxtw.nxv4i32(<vscale x 4 x i1>, i32*, <vscale x 4 x i32>)
+declare <vscale x 2 x i32> @llvm.aarch64.sve.ldff1.gather.sxtw.nxv2i32(<vscale x 2 x i1>, i32*, <vscale x 2 x i32>)
+declare <vscale x 4 x i32> @llvm.aarch64.sve.ldff1.gather.uxtw.nxv4i32(<vscale x 4 x i1>, i32*, <vscale x 4 x i32>)
+declare <vscale x 2 x i32> @llvm.aarch64.sve.ldff1.gather.uxtw.nxv2i32(<vscale x 2 x i1>, i32*, <vscale x 2 x i32>)
+
+declare <vscale x 4 x float> @llvm.aarch64.sve.ldff1.gather.sxtw.nxv4f32(<vscale x 4 x i1>, float*, <vscale x 4 x i32>)
+declare <vscale x 4 x float> @llvm.aarch64.sve.ldff1.gather.uxtw.nxv4f32(<vscale x 4 x i1>, float*, <vscale x 4 x i32>)
+
+; LDFF1D
+declare <vscale x 2 x i64> @llvm.aarch64.sve.ldff1.gather.sxtw.nxv2i64(<vscale x 2 x i1>, i64*, <vscale x 2 x i32>)
+declare <vscale x 2 x i64> @llvm.aarch64.sve.ldff1.gather.uxtw.nxv2i64(<vscale x 2 x i1>, i64*, <vscale x 2 x i32>)
+
+declare <vscale x 2 x double> @llvm.aarch64.sve.ldff1.gather.sxtw.nxv2f64(<vscale x 2 x i1>, double*, <vscale x 2 x i32>)
+declare <vscale x 2 x double> @llvm.aarch64.sve.ldff1.gather.uxtw.nxv2f64(<vscale x 2 x i1>, double*, <vscale x 2 x i32>)
diff --git a/llvm/test/CodeGen/AArch64/sve-intrinsics-ff-gather-loads-64bit-scaled-offset.ll b/llvm/test/CodeGen/AArch64/sve-intrinsics-ff-gather-loads-64bit-scaled-offset.ll
new file mode 100644
index 000000000000..4d5267356081
--- /dev/null
+++ b/llvm/test/CodeGen/AArch64/sve-intrinsics-ff-gather-loads-64bit-scaled-offset.ll
@@ -0,0 +1,80 @@
+; RUN: llc -mtriple=aarch64-linux-gnu -mattr=+sve < %s | FileCheck %s
+
+;
+; LDFF1H, LDFF1W, LDFF1D: base + 64-bit scaled offset
+; e.g. ldff1h { z0.d }, p0/z, [x0, z0.d, lsl #1]
+;
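+; Note: the '.index' intrinsics take per-element indices rather than byte
+; offsets; the index is scaled by the element size at selection time (the
+; 'lsl #1/#2/#3' in the checks below). There is no scaled form for byte
+; accesses, so LDFF1B is not covered in this file.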
+
+define <vscale x 2 x i64> @gldff1h_index(<vscale x 2 x i1> %pg, i16* %base, <vscale x 2 x i64> %b) {
+; CHECK-LABEL: gldff1h_index:
+; CHECK: ldff1h { z0.d }, p0/z, [x0, z0.d, lsl #1]
+; CHECK-NEXT: ret
+ %load = call <vscale x 2 x i16> @llvm.aarch64.sve.ldff1.gather.index.nxv2i16(<vscale x 2 x i1> %pg,
+ i16* %base,
+ <vscale x 2 x i64> %b)
+ %res = zext <vscale x 2 x i16> %load to <vscale x 2 x i64>
+ ret <vscale x 2 x i64> %res
+}
+
+define <vscale x 2 x i64> @gldff1w_index(<vscale x 2 x i1> %pg, i32* %base, <vscale x 2 x i64> %b) {
+; CHECK-LABEL: gldff1w_index:
+; CHECK: ldff1w { z0.d }, p0/z, [x0, z0.d, lsl #2]
+; CHECK-NEXT: ret
+ %load = call <vscale x 2 x i32> @llvm.aarch64.sve.ldff1.gather.index.nxv2i32(<vscale x 2 x i1> %pg,
+ i32* %base,
+ <vscale x 2 x i64> %b)
+ %res = zext <vscale x 2 x i32> %load to <vscale x 2 x i64>
+ ret <vscale x 2 x i64> %res
+}
+
+define <vscale x 2 x i64> @gldff1d_index(<vscale x 2 x i1> %pg, i64* %base, <vscale x 2 x i64> %b) {
+; CHECK-LABEL: gldff1d_index:
+; CHECK: ldff1d { z0.d }, p0/z, [x0, z0.d, lsl #3]
+; CHECK-NEXT: ret
+ %load = call <vscale x 2 x i64> @llvm.aarch64.sve.ldff1.gather.index.nxv2i64(<vscale x 2 x i1> %pg,
+ i64* %base,
+ <vscale x 2 x i64> %b)
+ ret <vscale x 2 x i64> %load
+}
+
+define <vscale x 2 x double> @gldff1d_index_double(<vscale x 2 x i1> %pg, double* %base, <vscale x 2 x i64> %b) {
+; CHECK-LABEL: gldff1d_index_double:
+; CHECK: ldff1d { z0.d }, p0/z, [x0, z0.d, lsl #3]
+; CHECK-NEXT: ret
+ %load = call <vscale x 2 x double> @llvm.aarch64.sve.ldff1.gather.index.nxv2f64(<vscale x 2 x i1> %pg,
+ double* %base,
+ <vscale x 2 x i64> %b)
+ ret <vscale x 2 x double> %load
+}
+
+;
+; LDFF1SH, LDFF1SW: base + 64-bit scaled offset
+; e.g. ldff1sh { z0.d }, p0/z, [x0, z0.d, lsl #1]
+;
+
+define <vscale x 2 x i64> @gldff1sh_index(<vscale x 2 x i1> %pg, i16* %base, <vscale x 2 x i64> %b) {
+; CHECK-LABEL: gldff1sh_index:
+; CHECK: ldff1sh { z0.d }, p0/z, [x0, z0.d, lsl #1]
+; CHECK-NEXT: ret
+ %load = call <vscale x 2 x i16> @llvm.aarch64.sve.ldff1.gather.index.nxv2i16(<vscale x 2 x i1> %pg,
+ i16* %base,
+ <vscale x 2 x i64> %b)
+ %res = sext <vscale x 2 x i16> %load to <vscale x 2 x i64>
+ ret <vscale x 2 x i64> %res
+}
+
+define <vscale x 2 x i64> @gldff1sw_index(<vscale x 2 x i1> %pg, i32* %base, <vscale x 2 x i64> %b) {
+; CHECK-LABEL: gldff1sw_index:
+; CHECK: ldff1sw { z0.d }, p0/z, [x0, z0.d, lsl #2]
+; CHECK-NEXT: ret
+ %load = call <vscale x 2 x i32> @llvm.aarch64.sve.ldff1.gather.index.nxv2i32(<vscale x 2 x i1> %pg,
+ i32* %base,
+ <vscale x 2 x i64> %b)
+ %res = sext <vscale x 2 x i32> %load to <vscale x 2 x i64>
+ ret <vscale x 2 x i64> %res
+}
+
+declare <vscale x 2 x i16> @llvm.aarch64.sve.ldff1.gather.index.nxv2i16(<vscale x 2 x i1>, i16*, <vscale x 2 x i64>)
+declare <vscale x 2 x i32> @llvm.aarch64.sve.ldff1.gather.index.nxv2i32(<vscale x 2 x i1>, i32*, <vscale x 2 x i64>)
+declare <vscale x 2 x i64> @llvm.aarch64.sve.ldff1.gather.index.nxv2i64(<vscale x 2 x i1>, i64*, <vscale x 2 x i64>)
+declare <vscale x 2 x double> @llvm.aarch64.sve.ldff1.gather.index.nxv2f64(<vscale x 2 x i1>, double*, <vscale x 2 x i64>)
diff --git a/llvm/test/CodeGen/AArch64/sve-intrinsics-ff-gather-loads-64bit-unscaled-offset.ll b/llvm/test/CodeGen/AArch64/sve-intrinsics-ff-gather-loads-64bit-unscaled-offset.ll
new file mode 100644
index 000000000000..570bac58cc9a
--- /dev/null
+++ b/llvm/test/CodeGen/AArch64/sve-intrinsics-ff-gather-loads-64bit-unscaled-offset.ll
@@ -0,0 +1,103 @@
+; RUN: llc -mtriple=aarch64-linux-gnu -mattr=+sve < %s | FileCheck %s
+
+;
+; LDFF1B, LDFF1W, LDFF1H, LDFF1D: base + 64-bit unscaled offset
+; e.g. ldff1h { z0.d }, p0/z, [x0, z0.d]
+;
+
+define <vscale x 2 x i64> @gldff1b_d(<vscale x 2 x i1> %pg, i8* %base, <vscale x 2 x i64> %b) {
+; CHECK-LABEL: gldff1b_d:
+; CHECK: ldff1b { z0.d }, p0/z, [x0, z0.d]
+; CHECK-NEXT: ret
+ %load = call <vscale x 2 x i8> @llvm.aarch64.sve.ldff1.gather.nxv2i8(<vscale x 2 x i1> %pg,
+ i8* %base,
+ <vscale x 2 x i64> %b)
+ %res = zext <vscale x 2 x i8> %load to <vscale x 2 x i64>
+ ret <vscale x 2 x i64> %res
+}
+
+define <vscale x 2 x i64> @gldff1h_d(<vscale x 2 x i1> %pg, i16* %base, <vscale x 2 x i64> %b) {
+; CHECK-LABEL: gldff1h_d:
+; CHECK: ldff1h { z0.d }, p0/z, [x0, z0.d]
+; CHECK-NEXT: ret
+ %load = call <vscale x 2 x i16> @llvm.aarch64.sve.ldff1.gather.nxv2i16(<vscale x 2 x i1> %pg,
+ i16* %base,
+ <vscale x 2 x i64> %b)
+ %res = zext <vscale x 2 x i16> %load to <vscale x 2 x i64>
+ ret <vscale x 2 x i64> %res
+}
+
+define <vscale x 2 x i64> @gldff1w_d(<vscale x 2 x i1> %pg, i32* %base, <vscale x 2 x i64> %offsets) {
+; CHECK-LABEL: gldff1w_d:
+; CHECK: ldff1w { z0.d }, p0/z, [x0, z0.d]
+; CHECK-NEXT: ret
+ %load = call <vscale x 2 x i32> @llvm.aarch64.sve.ldff1.gather.nxv2i32(<vscale x 2 x i1> %pg,
+ i32* %base,
+ <vscale x 2 x i64> %offsets)
+ %res = zext <vscale x 2 x i32> %load to <vscale x 2 x i64>
+ ret <vscale x 2 x i64> %res
+}
+
+define <vscale x 2 x i64> @gldff1d_d(<vscale x 2 x i1> %pg, i64* %base, <vscale x 2 x i64> %b) {
+; CHECK-LABEL: gldff1d_d:
+; CHECK: ldff1d { z0.d }, p0/z, [x0, z0.d]
+; CHECK-NEXT: ret
+ %load = call <vscale x 2 x i64> @llvm.aarch64.sve.ldff1.gather.nxv2i64(<vscale x 2 x i1> %pg,
+ i64* %base,
+ <vscale x 2 x i64> %b)
+ ret <vscale x 2 x i64> %load
+}
+
+define <vscale x 2 x double> @gldff1d_d_double(<vscale x 2 x i1> %pg, double* %base, <vscale x 2 x i64> %b) {
+; CHECK-LABEL: gldff1d_d_double:
+; CHECK: ldff1d { z0.d }, p0/z, [x0, z0.d]
+; CHECK-NEXT: ret
+ %load = call <vscale x 2 x double> @llvm.aarch64.sve.ldff1.gather.nxv2f64(<vscale x 2 x i1> %pg,
+ double* %base,
+ <vscale x 2 x i64> %b)
+ ret <vscale x 2 x double> %load
+}
+
+;
+; LDFF1SB, LDFF1SW, LDFF1SH: base + 64-bit unscaled offset
+; e.g. ldff1sh { z0.d }, p0/z, [x0, z0.d]
+;
+
+define <vscale x 2 x i64> @gldff1sb_d(<vscale x 2 x i1> %pg, i8* %base, <vscale x 2 x i64> %b) {
+; CHECK-LABEL: gldff1sb_d:
+; CHECK: ldff1sb { z0.d }, p0/z, [x0, z0.d]
+; CHECK-NEXT: ret
+ %load = call <vscale x 2 x i8> @llvm.aarch64.sve.ldff1.gather.nxv2i8(<vscale x 2 x i1> %pg,
+ i8* %base,
+ <vscale x 2 x i64> %b)
+ %res = sext <vscale x 2 x i8> %load to <vscale x 2 x i64>
+ ret <vscale x 2 x i64> %res
+}
+
+define <vscale x 2 x i64> @gldff1sh_d(<vscale x 2 x i1> %pg, i16* %base, <vscale x 2 x i64> %b) {
+; CHECK-LABEL: gldff1sh_d:
+; CHECK: ldff1sh { z0.d }, p0/z, [x0, z0.d]
+; CHECK-NEXT: ret
+ %load = call <vscale x 2 x i16> @llvm.aarch64.sve.ldff1.gather.nxv2i16(<vscale x 2 x i1> %pg,
+ i16* %base,
+ <vscale x 2 x i64> %b)
+ %res = sext <vscale x 2 x i16> %load to <vscale x 2 x i64>
+ ret <vscale x 2 x i64> %res
+}
+
+define <vscale x 2 x i64> @gldff1sw_d(<vscale x 2 x i1> %pg, i32* %base, <vscale x 2 x i64> %offsets) {
+; CHECK-LABEL: gldff1sw_d:
+; CHECK: ldff1sw { z0.d }, p0/z, [x0, z0.d]
+; CHECK-NEXT: ret
+ %load = call <vscale x 2 x i32> @llvm.aarch64.sve.ldff1.gather.nxv2i32(<vscale x 2 x i1> %pg,
+ i32* %base,
+ <vscale x 2 x i64> %offsets)
+ %res = sext <vscale x 2 x i32> %load to <vscale x 2 x i64>
+ ret <vscale x 2 x i64> %res
+}
+
+declare <vscale x 2 x i8> @llvm.aarch64.sve.ldff1.gather.nxv2i8(<vscale x 2 x i1>, i8*, <vscale x 2 x i64>)
+declare <vscale x 2 x i16> @llvm.aarch64.sve.ldff1.gather.nxv2i16(<vscale x 2 x i1>, i16*, <vscale x 2 x i64>)
+declare <vscale x 2 x i32> @llvm.aarch64.sve.ldff1.gather.nxv2i32(<vscale x 2 x i1>, i32*, <vscale x 2 x i64>)
+declare <vscale x 2 x i64> @llvm.aarch64.sve.ldff1.gather.nxv2i64(<vscale x 2 x i1>, i64*, <vscale x 2 x i64>)
+declare <vscale x 2 x double> @llvm.aarch64.sve.ldff1.gather.nxv2f64(<vscale x 2 x i1>, double*, <vscale x 2 x i64>)
diff --git a/llvm/test/CodeGen/AArch64/sve-intrinsics-ff-gather-loads-vector-base-imm-offset.ll b/llvm/test/CodeGen/AArch64/sve-intrinsics-ff-gather-loads-vector-base-imm-offset.ll
new file mode 100644
index 000000000000..5cb887932eff
--- /dev/null
+++ b/llvm/test/CodeGen/AArch64/sve-intrinsics-ff-gather-loads-vector-base-imm-offset.ll
@@ -0,0 +1,368 @@
+; RUN: llc -mtriple=aarch64-linux-gnu -mattr=+sve < %s | FileCheck %s
+
+;
+; LDFF1B, LDFF1W, LDFF1H, LDFF1D: vector base + immediate offset (index)
+; e.g. ldff1h { z0.s }, p0/z, [z0.s, #16]
+;
+
+; LDFF1B
+define <vscale x 4 x i32> @gldff1b_s_imm_offset(<vscale x 4 x i1> %pg, <vscale x 4 x i32> %base) {
+; CHECK-LABEL: gldff1b_s_imm_offset:
+; CHECK: ldff1b { z0.s }, p0/z, [z0.s, #16]
+; CHECK-NEXT: ret
+ %load = call <vscale x 4 x i8> @llvm.aarch64.sve.ldff1.gather.scalar.offset.nxv4i8.nxv4i32(<vscale x 4 x i1> %pg,
+ <vscale x 4 x i32> %base,
+ i64 16)
+ %res = zext <vscale x 4 x i8> %load to <vscale x 4 x i32>
+ ret <vscale x 4 x i32> %res
+}
+
+define <vscale x 2 x i64> @gldff1b_d_imm_offset(<vscale x 2 x i1> %pg, <vscale x 2 x i64> %base) {
+; CHECK-LABEL: gldff1b_d_imm_offset:
+; CHECK: ldff1b { z0.d }, p0/z, [z0.d, #16]
+; CHECK-NEXT: ret
+ %load = call <vscale x 2 x i8> @llvm.aarch64.sve.ldff1.gather.scalar.offset.nxv2i8.nxv2i64(<vscale x 2 x i1> %pg,
+ <vscale x 2 x i64> %base,
+ i64 16)
+ %res = zext <vscale x 2 x i8> %load to <vscale x 2 x i64>
+ ret <vscale x 2 x i64> %res
+}
+
+; LDFF1H
+define <vscale x 4 x i32> @gldff1h_s_imm_offset(<vscale x 4 x i1> %pg, <vscale x 4 x i32> %base) {
+; CHECK-LABEL: gldff1h_s_imm_offset:
+; CHECK: ldff1h { z0.s }, p0/z, [z0.s, #16]
+; CHECK-NEXT: ret
+ %load = call <vscale x 4 x i16> @llvm.aarch64.sve.ldff1.gather.scalar.offset.nxv4i16.nxv4i32(<vscale x 4 x i1> %pg,
+ <vscale x 4 x i32> %base,
+ i64 16)
+ %res = zext <vscale x 4 x i16> %load to <vscale x 4 x i32>
+ ret <vscale x 4 x i32> %res
+}
+
+define <vscale x 2 x i64> @gldff1h_d_imm_offset(<vscale x 2 x i1> %pg, <vscale x 2 x i64> %base) {
+; CHECK-LABEL: gldff1h_d_imm_offset:
+; CHECK: ldff1h { z0.d }, p0/z, [z0.d, #16]
+; CHECK-NEXT: ret
+ %load = call <vscale x 2 x i16> @llvm.aarch64.sve.ldff1.gather.scalar.offset.nxv2i16.nxv2i64(<vscale x 2 x i1> %pg,
+ <vscale x 2 x i64> %base,
+ i64 16)
+ %res = zext <vscale x 2 x i16> %load to <vscale x 2 x i64>
+ ret <vscale x 2 x i64> %res
+}
+
+; LDFF1W
+define <vscale x 4 x i32> @gldff1w_s_imm_offset(<vscale x 4 x i1> %pg, <vscale x 4 x i32> %base) {
+; CHECK-LABEL: gldff1w_s_imm_offset:
+; CHECK: ldff1w { z0.s }, p0/z, [z0.s, #16]
+; CHECK-NEXT: ret
+ %load = call <vscale x 4 x i32> @llvm.aarch64.sve.ldff1.gather.scalar.offset.nxv4i32.nxv4i32(<vscale x 4 x i1> %pg,
+ <vscale x 4 x i32> %base,
+ i64 16)
+ ret <vscale x 4 x i32> %load
+}
+
+define <vscale x 2 x i64> @gldff1w_d_imm_offset(<vscale x 2 x i1> %pg, <vscale x 2 x i64> %base) {
+; CHECK-LABEL: gldff1w_d_imm_offset:
+; CHECK: ldff1w { z0.d }, p0/z, [z0.d, #16]
+; CHECK-NEXT: ret
+ %load = call <vscale x 2 x i32> @llvm.aarch64.sve.ldff1.gather.scalar.offset.nxv2i32.nxv2i64(<vscale x 2 x i1> %pg,
+ <vscale x 2 x i64> %base,
+ i64 16)
+ %res = zext <vscale x 2 x i32> %load to <vscale x 2 x i64>
+ ret <vscale x 2 x i64> %res
+}
+
+define <vscale x 4 x float> @gldff1w_s_imm_offset_float(<vscale x 4 x i1> %pg, <vscale x 4 x i32> %base) {
+; CHECK-LABEL: gldff1w_s_imm_offset_float:
+; CHECK: ldff1w { z0.s }, p0/z, [z0.s, #16]
+; CHECK-NEXT: ret
+ %load = call <vscale x 4 x float> @llvm.aarch64.sve.ldff1.gather.scalar.offset.nxv4f32.nxv4i32(<vscale x 4 x i1> %pg,
+ <vscale x 4 x i32> %base,
+ i64 16)
+ ret <vscale x 4 x float> %load
+}
+
+; LDFF1D
+define <vscale x 2 x i64> @gldff1d_d_imm_offset(<vscale x 2 x i1> %pg, <vscale x 2 x i64> %base) {
+; CHECK-LABEL: gldff1d_d_imm_offset:
+; CHECK: ldff1d { z0.d }, p0/z, [z0.d, #16]
+; CHECK-NEXT: ret
+ %load = call <vscale x 2 x i64> @llvm.aarch64.sve.ldff1.gather.scalar.offset.nxv2i64.nxv2i64(<vscale x 2 x i1> %pg,
+ <vscale x 2 x i64> %base,
+ i64 16)
+ ret <vscale x 2 x i64> %load
+}
+
+define <vscale x 2 x double> @gldff1d_d_imm_offset_double(<vscale x 2 x i1> %pg, <vscale x 2 x i64> %base) {
+; CHECK-LABEL: gldff1d_d_imm_offset_double:
+; CHECK: ldff1d { z0.d }, p0/z, [z0.d, #16]
+; CHECK-NEXT: ret
+ %load = call <vscale x 2 x double> @llvm.aarch64.sve.ldff1.gather.scalar.offset.nxv2f64.nxv2i64(<vscale x 2 x i1> %pg,
+ <vscale x 2 x i64> %base,
+ i64 16)
+ ret <vscale x 2 x double> %load
+}
+
+;
+; LDFF1SB, LDFF1SW, LDFF1SH: vector base + immediate offset (index)
+; e.g. ldff1sh { z0.s }, p0/z, [z0.s, #16]
+;
+
+; LDFF1SB
+define <vscale x 4 x i32> @gldff1sb_s_imm_offset(<vscale x 4 x i1> %pg, <vscale x 4 x i32> %base) {
+; CHECK-LABEL: gldff1sb_s_imm_offset:
+; CHECK: ldff1sb { z0.s }, p0/z, [z0.s, #16]
+; CHECK-NEXT: ret
+ %load = call <vscale x 4 x i8> @llvm.aarch64.sve.ldff1.gather.scalar.offset.nxv4i8.nxv4i32(<vscale x 4 x i1> %pg,
+ <vscale x 4 x i32> %base,
+ i64 16)
+ %res = sext <vscale x 4 x i8> %load to <vscale x 4 x i32>
+ ret <vscale x 4 x i32> %res
+}
+
+define <vscale x 2 x i64> @gldff1sb_d_imm_offset(<vscale x 2 x i1> %pg, <vscale x 2 x i64> %base) {
+; CHECK-LABEL: gldff1sb_d_imm_offset:
+; CHECK: ldff1sb { z0.d }, p0/z, [z0.d, #16]
+; CHECK-NEXT: ret
+ %load = call <vscale x 2 x i8> @llvm.aarch64.sve.ldff1.gather.scalar.offset.nxv2i8.nxv2i64(<vscale x 2 x i1> %pg,
+ <vscale x 2 x i64> %base,
+ i64 16)
+ %res = sext <vscale x 2 x i8> %load to <vscale x 2 x i64>
+ ret <vscale x 2 x i64> %res
+}
+
+; LDFF1SH
+define <vscale x 4 x i32> @gldff1sh_s_imm_offset(<vscale x 4 x i1> %pg, <vscale x 4 x i32> %base) {
+; CHECK-LABEL: gldff1sh_s_imm_offset:
+; CHECK: ldff1sh { z0.s }, p0/z, [z0.s, #16]
+; CHECK-NEXT: ret
+ %load = call <vscale x 4 x i16> @llvm.aarch64.sve.ldff1.gather.scalar.offset.nxv4i16.nxv4i32(<vscale x 4 x i1> %pg,
+ <vscale x 4 x i32> %base,
+ i64 16)
+ %res = sext <vscale x 4 x i16> %load to <vscale x 4 x i32>
+ ret <vscale x 4 x i32> %res
+}
+
+define <vscale x 2 x i64> @gldff1sh_d_imm_offset(<vscale x 2 x i1> %pg, <vscale x 2 x i64> %base) {
+; CHECK-LABEL: gldff1sh_d_imm_offset:
+; CHECK: ldff1sh { z0.d }, p0/z, [z0.d, #16]
+; CHECK-NEXT: ret
+ %load = call <vscale x 2 x i16> @llvm.aarch64.sve.ldff1.gather.scalar.offset.nxv2i16.nxv2i64(<vscale x 2 x i1> %pg,
+ <vscale x 2 x i64> %base,
+ i64 16)
+ %res = sext <vscale x 2 x i16> %load to <vscale x 2 x i64>
+ ret <vscale x 2 x i64> %res
+}
+
+; LDFF1SW
+define <vscale x 2 x i64> @gldff1sw_d_imm_offset(<vscale x 2 x i1> %pg, <vscale x 2 x i64> %base) {
+; CHECK-LABEL: gldff1sw_d_imm_offset:
+; CHECK: ldff1sw { z0.d }, p0/z, [z0.d, #16]
+; CHECK-NEXT: ret
+ %load = call <vscale x 2 x i32> @llvm.aarch64.sve.ldff1.gather.scalar.offset.nxv2i32.nxv2i64(<vscale x 2 x i1> %pg,
+ <vscale x 2 x i64> %base,
+ i64 16)
+ %res = sext <vscale x 2 x i32> %load to <vscale x 2 x i64>
+ ret <vscale x 2 x i64> %res
+}
+
+;
+; LDFF1B, LDFF1W, LDFF1H, LDFF1D: vector base + out of range immediate offset
+; e.g. ldff1b { z0.d }, p0/z, [x0, z0.d]
+;
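+; Note: the vector-plus-immediate form encodes a 5-bit unsigned immediate
+; scaled by the element size (so at most 31 bytes for B, 62 for H, 124 for W,
+; 248 for D, in element-size multiples). Offsets outside that range are
+; expected to be materialized into a scalar register, falling back to the
+; register-offset form checked below.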
+
+; LDFF1B
+define <vscale x 4 x i32> @gldff1b_s_imm_offset_out_of_range(<vscale x 4 x i1> %pg, <vscale x 4 x i32> %base) {
+; CHECK-LABEL: gldff1b_s_imm_offset_out_of_range:
+; CHECK: mov w8, #32
+; CHECK-NEXT: ldff1b { z0.s }, p0/z, [x8, z0.s, uxtw]
+; CHECK-NEXT: ret
+ %load = call <vscale x 4 x i8> @llvm.aarch64.sve.ldff1.gather.scalar.offset.nxv4i8.nxv4i32(<vscale x 4 x i1> %pg,
+ <vscale x 4 x i32> %base,
+ i64 32)
+ %res = zext <vscale x 4 x i8> %load to <vscale x 4 x i32>
+ ret <vscale x 4 x i32> %res
+}
+
+define <vscale x 2 x i64> @gldff1b_d_imm_offset_out_of_range(<vscale x 2 x i1> %pg, <vscale x 2 x i64> %base) {
+; CHECK-LABEL: gldff1b_d_imm_offset_out_of_range:
+; CHECK: mov w8, #32
+; CHECK-NEXT: ldff1b { z0.d }, p0/z, [x8, z0.d]
+; CHECK-NEXT: ret
+ %load = call <vscale x 2 x i8> @llvm.aarch64.sve.ldff1.gather.scalar.offset.nxv2i8.nxv2i64(<vscale x 2 x i1> %pg,
+ <vscale x 2 x i64> %base,
+ i64 32)
+ %res = zext <vscale x 2 x i8> %load to <vscale x 2 x i64>
+ ret <vscale x 2 x i64> %res
+}
+
+; LDFF1H
+define <vscale x 4 x i32> @gldff1h_s_imm_offset_out_of_range(<vscale x 4 x i1> %pg, <vscale x 4 x i32> %base) {
+; CHECK-LABEL: gldff1h_s_imm_offset_out_of_range:
+; CHECK: mov w8, #63
+; CHECK-NEXT: ldff1h { z0.s }, p0/z, [x8, z0.s, uxtw]
+; CHECK-NEXT: ret
+ %load = call <vscale x 4 x i16> @llvm.aarch64.sve.ldff1.gather.scalar.offset.nxv4i16.nxv4i32(<vscale x 4 x i1> %pg,
+ <vscale x 4 x i32> %base,
+ i64 63)
+ %res = zext <vscale x 4 x i16> %load to <vscale x 4 x i32>
+ ret <vscale x 4 x i32> %res
+}
+
+define <vscale x 2 x i64> @gldff1h_d_imm_offset_out_of_range(<vscale x 2 x i1> %pg, <vscale x 2 x i64> %base) {
+; CHECK-LABEL: gldff1h_d_imm_offset_out_of_range:
+; CHECK: mov w8, #63
+; CHECK-NEXT: ldff1h { z0.d }, p0/z, [x8, z0.d]
+; CHECK-NEXT: ret
+ %load = call <vscale x 2 x i16> @llvm.aarch64.sve.ldff1.gather.scalar.offset.nxv2i16.nxv2i64(<vscale x 2 x i1> %pg,
+ <vscale x 2 x i64> %base,
+ i64 63)
+ %res = zext <vscale x 2 x i16> %load to <vscale x 2 x i64>
+ ret <vscale x 2 x i64> %res
+}
+
+; LDFF1W
+define <vscale x 4 x i32> @gldff1w_s_imm_offset_out_of_range(<vscale x 4 x i1> %pg, <vscale x 4 x i32> %base) {
+; CHECK-LABEL: gldff1w_s_imm_offset_out_of_range:
+; CHECK: mov w8, #125
+; CHECK-NEXT: ldff1w { z0.s }, p0/z, [x8, z0.s, uxtw]
+; CHECK-NEXT: ret
+ %load = call <vscale x 4 x i32> @llvm.aarch64.sve.ldff1.gather.scalar.offset.nxv4i32.nxv4i32(<vscale x 4 x i1> %pg,
+ <vscale x 4 x i32> %base,
+ i64 125)
+ ret <vscale x 4 x i32> %load
+}
+
+define <vscale x 2 x i64> @gldff1w_d_imm_offset_out_of_range(<vscale x 2 x i1> %pg, <vscale x 2 x i64> %base) {
+; CHECK-LABEL: gldff1w_d_imm_offset_out_of_range:
+; CHECK: mov w8, #125
+; CHECK-NEXT: ldff1w { z0.d }, p0/z, [x8, z0.d]
+; CHECK-NEXT: ret
+ %load = call <vscale x 2 x i32> @llvm.aarch64.sve.ldff1.gather.scalar.offset.nxv2i32.nxv2i64(<vscale x 2 x i1> %pg,
+ <vscale x 2 x i64> %base,
+ i64 125)
+ %res = zext <vscale x 2 x i32> %load to <vscale x 2 x i64>
+ ret <vscale x 2 x i64> %res
+}
+
+define <vscale x 4 x float> @gldff1w_s_imm_offset_out_of_range_float(<vscale x 4 x i1> %pg, <vscale x 4 x i32> %base) {
+; CHECK-LABEL: gldff1w_s_imm_offset_out_of_range_float:
+; CHECK: mov w8, #125
+; CHECK-NEXT: ldff1w { z0.s }, p0/z, [x8, z0.s, uxtw]
+; CHECK-NEXT: ret
+ %load = call <vscale x 4 x float> @llvm.aarch64.sve.ldff1.gather.scalar.offset.nxv4f32.nxv4i32(<vscale x 4 x i1> %pg,
+ <vscale x 4 x i32> %base,
+ i64 125)
+ ret <vscale x 4 x float> %load
+}
+
+; LDFF1D
+define <vscale x 2 x i64> @gldff1d_d_imm_offset_out_of_range(<vscale x 2 x i1> %pg, <vscale x 2 x i64> %base) {
+; CHECK-LABEL: gldff1d_d_imm_offset_out_of_range:
+; CHECK: mov w8, #249
+; CHECK-NEXT: ldff1d { z0.d }, p0/z, [x8, z0.d]
+; CHECK-NEXT: ret
+ %load = call <vscale x 2 x i64> @llvm.aarch64.sve.ldff1.gather.scalar.offset.nxv2i64.nxv2i64(<vscale x 2 x i1> %pg,
+ <vscale x 2 x i64> %base,
+ i64 249)
+ ret <vscale x 2 x i64> %load
+}
+
+define <vscale x 2 x double> @gldff1d_d_imm_offset_out_of_range_double(<vscale x 2 x i1> %pg, <vscale x 2 x i64> %base) {
+; CHECK-LABEL: gldff1d_d_imm_offset_out_of_range_double:
+; CHECK: mov w8, #249
+; CHECK-NEXT: ldff1d { z0.d }, p0/z, [x8, z0.d]
+; CHECK-NEXT: ret
+ %load = call <vscale x 2 x double> @llvm.aarch64.sve.ldff1.gather.scalar.offset.nxv2f64.nxv2i64(<vscale x 2 x i1> %pg,
+ <vscale x 2 x i64> %base,
+ i64 249)
+ ret <vscale x 2 x double> %load
+}
+
+;
+; LDFF1SB, LDFF1SW, LDFF1SH: vector base + out of range immediate offset
+; e.g. ldff1sb { z0.s }, p0/z, [x8, z0.s, uxtw]
+;
+
+; LDFF1SB
+define <vscale x 4 x i32> @gldff1sb_s_imm_offset_out_of_range(<vscale x 4 x i1> %pg, <vscale x 4 x i32> %base) {
+; CHECK-LABEL: gldff1sb_s_imm_offset_out_of_range:
+; CHECK: mov w8, #32
+; CHECK-NEXT: ldff1sb { z0.s }, p0/z, [x8, z0.s, uxtw]
+; CHECK-NEXT: ret
+ %load = call <vscale x 4 x i8> @llvm.aarch64.sve.ldff1.gather.scalar.offset.nxv4i8.nxv4i32(<vscale x 4 x i1> %pg,
+ <vscale x 4 x i32> %base,
+ i64 32)
+ %res = sext <vscale x 4 x i8> %load to <vscale x 4 x i32>
+ ret <vscale x 4 x i32> %res
+}
+
+define <vscale x 2 x i64> @gldff1sb_d_imm_offset_out_of_range(<vscale x 2 x i1> %pg, <vscale x 2 x i64> %base) {
+; CHECK-LABEL: gldff1sb_d_imm_offset_out_of_range:
+; CHECK: mov w8, #32
+; CHECK-NEXT: ldff1sb { z0.d }, p0/z, [x8, z0.d]
+; CHECK-NEXT: ret
+ %load = call <vscale x 2 x i8> @llvm.aarch64.sve.ldff1.gather.scalar.offset.nxv2i8.nxv2i64(<vscale x 2 x i1> %pg,
+ <vscale x 2 x i64> %base,
+ i64 32)
+ %res = sext <vscale x 2 x i8> %load to <vscale x 2 x i64>
+ ret <vscale x 2 x i64> %res
+}
+
+; LDFF1SH
+define <vscale x 4 x i32> @gldff1sh_s_imm_offset_out_of_range(<vscale x 4 x i1> %pg, <vscale x 4 x i32> %base) {
+; CHECK-LABEL: gldff1sh_s_imm_offset_out_of_range:
+; CHECK: mov w8, #63
+; CHECK-NEXT: ldff1sh { z0.s }, p0/z, [x8, z0.s, uxtw]
+; CHECK-NEXT: ret
+ %load = call <vscale x 4 x i16> @llvm.aarch64.sve.ldff1.gather.scalar.offset.nxv4i16.nxv4i32(<vscale x 4 x i1> %pg,
+ <vscale x 4 x i32> %base,
+ i64 63)
+ %res = sext <vscale x 4 x i16> %load to <vscale x 4 x i32>
+ ret <vscale x 4 x i32> %res
+}
+
+define <vscale x 2 x i64> @gldff1sh_d_imm_offset_out_of_range(<vscale x 2 x i1> %pg, <vscale x 2 x i64> %base) {
+; CHECK-LABEL: gldff1sh_d_imm_offset_out_of_range:
+; CHECK: mov w8, #63
+; CHECK-NEXT: ldff1sh { z0.d }, p0/z, [x8, z0.d]
+; CHECK-NEXT: ret
+ %load = call <vscale x 2 x i16> @llvm.aarch64.sve.ldff1.gather.scalar.offset.nxv2i16.nxv2i64(<vscale x 2 x i1> %pg,
+ <vscale x 2 x i64> %base,
+ i64 63)
+ %res = sext <vscale x 2 x i16> %load to <vscale x 2 x i64>
+ ret <vscale x 2 x i64> %res
+}
+
+; LDFF1SW
+define <vscale x 2 x i64> @gldff1sw_d_imm_offset_out_of_range(<vscale x 2 x i1> %pg, <vscale x 2 x i64> %base) {
+; CHECK-LABEL: gldff1sw_d_imm_offset_out_of_range:
+; CHECK: mov w8, #125
+; CHECK-NEXT: ldff1sw { z0.d }, p0/z, [x8, z0.d]
+; CHECK-NEXT: ret
+ %load = call <vscale x 2 x i32> @llvm.aarch64.sve.ldff1.gather.scalar.offset.nxv2i32.nxv2i64(<vscale x 2 x i1> %pg,
+ <vscale x 2 x i64> %base,
+ i64 125)
+ %res = sext <vscale x 2 x i32> %load to <vscale x 2 x i64>
+ ret <vscale x 2 x i64> %res
+}
+
+; LDFF1B/LDFF1SB
+declare <vscale x 4 x i8> @llvm.aarch64.sve.ldff1.gather.scalar.offset.nxv4i8.nxv4i32(<vscale x 4 x i1>, <vscale x 4 x i32>, i64)
+declare <vscale x 2 x i8> @llvm.aarch64.sve.ldff1.gather.scalar.offset.nxv2i8.nxv2i64(<vscale x 2 x i1>, <vscale x 2 x i64>, i64)
+
+; LDFF1H/LDFF1SH
+declare <vscale x 4 x i16> @llvm.aarch64.sve.ldff1.gather.scalar.offset.nxv4i16.nxv4i32(<vscale x 4 x i1>, <vscale x 4 x i32>, i64)
+declare <vscale x 2 x i16> @llvm.aarch64.sve.ldff1.gather.scalar.offset.nxv2i16.nxv2i64(<vscale x 2 x i1>, <vscale x 2 x i64>, i64)
+
+; LDFF1W/LDFF1SW
+declare <vscale x 4 x i32> @llvm.aarch64.sve.ldff1.gather.scalar.offset.nxv4i32.nxv4i32(<vscale x 4 x i1>, <vscale x 4 x i32>, i64)
+declare <vscale x 2 x i32> @llvm.aarch64.sve.ldff1.gather.scalar.offset.nxv2i32.nxv2i64(<vscale x 2 x i1>, <vscale x 2 x i64>, i64)
+
+declare <vscale x 4 x float> @llvm.aarch64.sve.ldff1.gather.scalar.offset.nxv4f32.nxv4i32(<vscale x 4 x i1>, <vscale x 4 x i32>, i64)
+
+; LDFF1D
+declare <vscale x 2 x i64> @llvm.aarch64.sve.ldff1.gather.scalar.offset.nxv2i64.nxv2i64(<vscale x 2 x i1>, <vscale x 2 x i64>, i64)
+
+declare <vscale x 2 x double> @llvm.aarch64.sve.ldff1.gather.scalar.offset.nxv2f64.nxv2i64(<vscale x 2 x i1>, <vscale x 2 x i64>, i64)
diff --git a/llvm/test/CodeGen/AArch64/sve-intrinsics-ff-gather-loads-vector-base-scalar-offset.ll b/llvm/test/CodeGen/AArch64/sve-intrinsics-ff-gather-loads-vector-base-scalar-offset.ll
new file mode 100644
index 000000000000..4b9fba9dc275
--- /dev/null
+++ b/llvm/test/CodeGen/AArch64/sve-intrinsics-ff-gather-loads-vector-base-scalar-offset.ll
@@ -0,0 +1,186 @@
+; RUN: llc -mtriple=aarch64-linux-gnu -mattr=+sve < %s | FileCheck %s
+
+;
+; LDFF1B, LDFF1W, LDFF1H, LDFF1D: vector base + scalar offset (index)
+; e.g. ldff1b { z0.d }, p0/z, [x0, z0.d]
+;
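+; Note: with a run-time scalar offset the vector-plus-immediate form cannot be
+; used, so the expectation is that the operands are swapped and the scalar
+; becomes the base of a scalar-plus-vector form (hence [x0, z0.s, uxtw] /
+; [x0, z0.d] in the checks below).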
+
+; LDFF1B
+define <vscale x 4 x i32> @gldff1b_s_scalar_offset(<vscale x 4 x i1> %pg, <vscale x 4 x i32> %base, i64 %offset) {
+; CHECK-LABEL: gldff1b_s_scalar_offset:
+; CHECK: ldff1b { z0.s }, p0/z, [x0, z0.s, uxtw]
+; CHECK-NEXT: ret
+ %load = call <vscale x 4 x i8> @llvm.aarch64.sve.ldff1.gather.scalar.offset.nxv4i8.nxv4i32(<vscale x 4 x i1> %pg,
+ <vscale x 4 x i32> %base,
+ i64 %offset)
+ %res = zext <vscale x 4 x i8> %load to <vscale x 4 x i32>
+ ret <vscale x 4 x i32> %res
+}
+
+define <vscale x 2 x i64> @gldff1b_d_scalar_offset(<vscale x 2 x i1> %pg, <vscale x 2 x i64> %base, i64 %offset) {
+; CHECK-LABEL: gldff1b_d_scalar_offset:
+; CHECK: ldff1b { z0.d }, p0/z, [x0, z0.d]
+; CHECK-NEXT: ret
+ %load = call <vscale x 2 x i8> @llvm.aarch64.sve.ldff1.gather.scalar.offset.nxv2i8.nxv2i64(<vscale x 2 x i1> %pg,
+ <vscale x 2 x i64> %base,
+ i64 %offset)
+ %res = zext <vscale x 2 x i8> %load to <vscale x 2 x i64>
+ ret <vscale x 2 x i64> %res
+}
+
+; LDFF1H
+define <vscale x 4 x i32> @gldff1h_s_scalar_offset(<vscale x 4 x i1> %pg, <vscale x 4 x i32> %base, i64 %offset) {
+; CHECK-LABEL: gldff1h_s_scalar_offset:
+; CHECK: ldff1h { z0.s }, p0/z, [x0, z0.s, uxtw]
+; CHECK-NEXT: ret
+ %load = call <vscale x 4 x i16> @llvm.aarch64.sve.ldff1.gather.scalar.offset.nxv4i16.nxv4i32(<vscale x 4 x i1> %pg,
+ <vscale x 4 x i32> %base,
+ i64 %offset)
+ %res = zext <vscale x 4 x i16> %load to <vscale x 4 x i32>
+ ret <vscale x 4 x i32> %res
+}
+
+define <vscale x 2 x i64> @gldff1h_d_scalar_offset(<vscale x 2 x i1> %pg, <vscale x 2 x i64> %base, i64 %offset) {
+; CHECK-LABEL: gldff1h_d_scalar_offset:
+; CHECK: ldff1h { z0.d }, p0/z, [x0, z0.d]
+; CHECK-NEXT: ret
+ %load = call <vscale x 2 x i16> @llvm.aarch64.sve.ldff1.gather.scalar.offset.nxv2i16.nxv2i64(<vscale x 2 x i1> %pg,
+ <vscale x 2 x i64> %base,
+ i64 %offset)
+ %res = zext <vscale x 2 x i16> %load to <vscale x 2 x i64>
+ ret <vscale x 2 x i64> %res
+}
+
+; LDFF1W
+define <vscale x 4 x i32> @gldff1w_s_scalar_offset(<vscale x 4 x i1> %pg, <vscale x 4 x i32> %base, i64 %offset) {
+; CHECK-LABEL: gldff1w_s_scalar_offset:
+; CHECK: ldff1w { z0.s }, p0/z, [x0, z0.s, uxtw]
+; CHECK-NEXT: ret
+ %load = call <vscale x 4 x i32> @llvm.aarch64.sve.ldff1.gather.scalar.offset.nxv4i32.nxv4i32(<vscale x 4 x i1> %pg,
+ <vscale x 4 x i32> %base,
+ i64 %offset)
+ ret <vscale x 4 x i32> %load
+}
+
+define <vscale x 2 x i64> @gldff1w_d_scalar_offset(<vscale x 2 x i1> %pg, <vscale x 2 x i64> %base, i64 %offset) {
+; CHECK-LABEL: gldff1w_d_scalar_offset:
+; CHECK: ldff1w { z0.d }, p0/z, [x0, z0.d]
+; CHECK-NEXT: ret
+ %load = call <vscale x 2 x i32> @llvm.aarch64.sve.ldff1.gather.scalar.offset.nxv2i32.nxv2i64(<vscale x 2 x i1> %pg,
+ <vscale x 2 x i64> %base,
+ i64 %offset)
+ %res = zext <vscale x 2 x i32> %load to <vscale x 2 x i64>
+ ret <vscale x 2 x i64> %res
+}
+
+define <vscale x 4 x float> @gldff1w_s_scalar_offset_float(<vscale x 4 x i1> %pg, <vscale x 4 x i32> %base, i64 %offset) {
+; CHECK-LABEL: gldff1w_s_scalar_offset_float:
+; CHECK: ldff1w { z0.s }, p0/z, [x0, z0.s, uxtw]
+; CHECK-NEXT: ret
+ %load = call <vscale x 4 x float> @llvm.aarch64.sve.ldff1.gather.scalar.offset.nxv4f32.nxv4i32(<vscale x 4 x i1> %pg,
+ <vscale x 4 x i32> %base,
+ i64 %offset)
+ ret <vscale x 4 x float> %load
+}
+
+; LDFF1D
+define <vscale x 2 x i64> @gldff1d_d_scalar_offset(<vscale x 2 x i1> %pg, <vscale x 2 x i64> %base, i64 %offset) {
+; CHECK-LABEL: gldff1d_d_scalar_offset:
+; CHECK: ldff1d { z0.d }, p0/z, [x0, z0.d]
+; CHECK-NEXT: ret
+ %load = call <vscale x 2 x i64> @llvm.aarch64.sve.ldff1.gather.scalar.offset.nxv2i64.nxv2i64(<vscale x 2 x i1> %pg,
+ <vscale x 2 x i64> %base,
+ i64 %offset)
+ ret <vscale x 2 x i64> %load
+}
+
+define <vscale x 2 x double> @gldff1d_d_scalar_offset_double(<vscale x 2 x i1> %pg, <vscale x 2 x i64> %base, i64 %offset) {
+; CHECK-LABEL: gldff1d_d_scalar_offset_double:
+; CHECK: ldff1d { z0.d }, p0/z, [x0, z0.d]
+; CHECK-NEXT: ret
+ %load = call <vscale x 2 x double> @llvm.aarch64.sve.ldff1.gather.scalar.offset.nxv2f64.nxv2i64(<vscale x 2 x i1> %pg,
+ <vscale x 2 x i64> %base,
+ i64 %offset)
+ ret <vscale x 2 x double> %load
+}
+
+;
+; LDFF1SB, LDFF1SW, LDFF1SH: vector base + scalar offset (index)
+; e.g. ldff1sb { z0.d }, p0/z, [x0, z0.d]
+;
+
+; LDFF1SB
+define <vscale x 4 x i32> @gldff1sb_s_scalar_offset(<vscale x 4 x i1> %pg, <vscale x 4 x i32> %base, i64 %offset) {
+; CHECK-LABEL: gldff1sb_s_scalar_offset:
+; CHECK: ldff1sb { z0.s }, p0/z, [x0, z0.s, uxtw]
+; CHECK-NEXT: ret
+ %load = call <vscale x 4 x i8> @llvm.aarch64.sve.ldff1.gather.scalar.offset.nxv4i8.nxv4i32(<vscale x 4 x i1> %pg,
+ <vscale x 4 x i32> %base,
+ i64 %offset)
+ %res = sext <vscale x 4 x i8> %load to <vscale x 4 x i32>
+ ret <vscale x 4 x i32> %res
+}
+
+define <vscale x 2 x i64> @gldff1sb_d_scalar_offset(<vscale x 2 x i1> %pg, <vscale x 2 x i64> %base, i64 %offset) {
+; CHECK-LABEL: gldff1sb_d_scalar_offset:
+; CHECK: ldff1sb { z0.d }, p0/z, [x0, z0.d]
+; CHECK-NEXT: ret
+ %load = call <vscale x 2 x i8> @llvm.aarch64.sve.ldff1.gather.scalar.offset.nxv2i8.nxv2i64(<vscale x 2 x i1> %pg,
+ <vscale x 2 x i64> %base,
+ i64 %offset)
+ %res = sext <vscale x 2 x i8> %load to <vscale x 2 x i64>
+ ret <vscale x 2 x i64> %res
+}
+
+; LDFF1SH
+define <vscale x 4 x i32> @gldff1sh_s_scalar_offset(<vscale x 4 x i1> %pg, <vscale x 4 x i32> %base, i64 %offset) {
+; CHECK-LABEL: gldff1sh_s_scalar_offset:
+; CHECK: ldff1sh { z0.s }, p0/z, [x0, z0.s, uxtw]
+; CHECK-NEXT: ret
+ %load = call <vscale x 4 x i16> @llvm.aarch64.sve.ldff1.gather.scalar.offset.nxv4i16.nxv4i32(<vscale x 4 x i1> %pg,
+ <vscale x 4 x i32> %base,
+ i64 %offset)
+ %res = sext <vscale x 4 x i16> %load to <vscale x 4 x i32>
+ ret <vscale x 4 x i32> %res
+}
+
+define <vscale x 2 x i64> @gldff1sh_d_scalar_offset(<vscale x 2 x i1> %pg, <vscale x 2 x i64> %base, i64 %offset) {
+; CHECK-LABEL: gldff1sh_d_scalar_offset:
+; CHECK: ldff1sh { z0.d }, p0/z, [x0, z0.d]
+; CHECK-NEXT: ret
+ %load = call <vscale x 2 x i16> @llvm.aarch64.sve.ldff1.gather.scalar.offset.nxv2i16.nxv2i64(<vscale x 2 x i1> %pg,
+ <vscale x 2 x i64> %base,
+ i64 %offset)
+ %res = sext <vscale x 2 x i16> %load to <vscale x 2 x i64>
+ ret <vscale x 2 x i64> %res
+}
+
+; LDFF1SW
+define <vscale x 2 x i64> @gldff1sw_d_scalar_offset(<vscale x 2 x i1> %pg, <vscale x 2 x i64> %base, i64 %offset) {
+; CHECK-LABEL: gldff1sw_d_scalar_offset:
+; CHECK: ldff1sw { z0.d }, p0/z, [x0, z0.d]
+; CHECK-NEXT: ret
+ %load = call <vscale x 2 x i32> @llvm.aarch64.sve.ldff1.gather.scalar.offset.nxv2i32.nxv2i64(<vscale x 2 x i1> %pg,
+ <vscale x 2 x i64> %base,
+ i64 %offset)
+ %res = sext <vscale x 2 x i32> %load to <vscale x 2 x i64>
+ ret <vscale x 2 x i64> %res
+}
+
+; LDFF1B/LDFF1SB
+declare <vscale x 4 x i8> @llvm.aarch64.sve.ldff1.gather.scalar.offset.nxv4i8.nxv4i32(<vscale x 4 x i1>, <vscale x 4 x i32>, i64)
+declare <vscale x 2 x i8> @llvm.aarch64.sve.ldff1.gather.scalar.offset.nxv2i8.nxv2i64(<vscale x 2 x i1>, <vscale x 2 x i64>, i64)
+
+; LDFF1H/LDFF1SH
+declare <vscale x 4 x i16> @llvm.aarch64.sve.ldff1.gather.scalar.offset.nxv4i16.nxv4i32(<vscale x 4 x i1>, <vscale x 4 x i32>, i64)
+declare <vscale x 2 x i16> @llvm.aarch64.sve.ldff1.gather.scalar.offset.nxv2i16.nxv2i64(<vscale x 2 x i1>, <vscale x 2 x i64>, i64)
+
+; LDFF1W/LDFF1SW
+declare <vscale x 4 x i32> @llvm.aarch64.sve.ldff1.gather.scalar.offset.nxv4i32.nxv4i32(<vscale x 4 x i1>, <vscale x 4 x i32>, i64)
+declare <vscale x 2 x i32> @llvm.aarch64.sve.ldff1.gather.scalar.offset.nxv2i32.nxv2i64(<vscale x 2 x i1>, <vscale x 2 x i64>, i64)
+
+declare <vscale x 4 x float> @llvm.aarch64.sve.ldff1.gather.scalar.offset.nxv4f32.nxv4i32(<vscale x 4 x i1>, <vscale x 4 x i32>, i64)
+
+; LDFF1D
+declare <vscale x 2 x i64> @llvm.aarch64.sve.ldff1.gather.scalar.offset.nxv2i64.nxv2i64(<vscale x 2 x i1>, <vscale x 2 x i64>, i64)
+
+declare <vscale x 2 x double> @llvm.aarch64.sve.ldff1.gather.scalar.offset.nxv2f64.nxv2i64(<vscale x 2 x i1>, <vscale x 2 x i64>, i64)