[llvm] 6e51ceb - [AArch64][SVE] Add intrinsics for gather loads with 64-bit offsets
Sander de Smalen via llvm-commits
llvm-commits at lists.llvm.org
Tue Dec 3 04:56:01 PST 2019
Author: Sander de Smalen
Date: 2019-12-03T12:55:03Z
New Revision: 6e51ceba536d88f882737c9c4f9ff0ffb0004bfd
URL: https://github.com/llvm/llvm-project/commit/6e51ceba536d88f882737c9c4f9ff0ffb0004bfd
DIFF: https://github.com/llvm/llvm-project/commit/6e51ceba536d88f882737c9c4f9ff0ffb0004bfd.diff
LOG: [AArch64][SVE] Add intrinsics for gather loads with 64-bit offsets
This patch adds the following intrinsics for gather loads with 64-bit offsets:
* @llvm.aarch64.sve.ld1.gather (unscaled offset)
* @llvm.aarch64.sve.ld1.gather.index (scaled offset)
These intrinsics map 1-1 to the following AArch64 instructions respectively (examples for half-words):
* ld1h { z0.d }, p0/z, [x0, z0.d]
* ld1h { z0.d }, p0/z, [x0, z0.d, lsl #1]
Committing on behalf of Andrzej Warzynski (andwar)
Reviewers: sdesmalen, huntergr, rovka, mgudim, dancgr, rengolin, efriedma
Reviewed By: efriedma
Tags: #llvm
Differential Revision: https://reviews.llvm.org/D70542
Added:
llvm/test/CodeGen/AArch64/sve-intrinsics-gather-loads-64bit-scaled-offset.ll
llvm/test/CodeGen/AArch64/sve-intrinsics-gather-loads-64bit-unscaled-offset.ll
Modified:
llvm/include/llvm/IR/IntrinsicsAArch64.td
llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
llvm/lib/Target/AArch64/AArch64ISelLowering.h
llvm/lib/Target/AArch64/AArch64InstrFormats.td
llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td
llvm/lib/Target/AArch64/SVEInstrFormats.td
llvm/lib/Target/AArch64/Utils/AArch64BaseInfo.h
Removed:
################################################################################
diff --git a/llvm/include/llvm/IR/IntrinsicsAArch64.td b/llvm/include/llvm/IR/IntrinsicsAArch64.td
index 72bc4a2aa216..d4ed3d7b8ec5 100644
--- a/llvm/include/llvm/IR/IntrinsicsAArch64.td
+++ b/llvm/include/llvm/IR/IntrinsicsAArch64.td
@@ -942,6 +942,15 @@ let TargetPrefix = "aarch64" in { // All intrinsics start with "llvm.aarch64.".
llvm_i32_ty],
[IntrNoMem]>;
+class AdvSIMD_GatherLoad_64bitOffset_Intrinsic // Shared signature of the SVE gather loads taking a scalar base plus a vector of 64-bit offsets.
+ : Intrinsic<[llvm_anyvector_ty], // result: overloaded vector type (overload index 0)
+ [
+ LLVMScalarOrSameVectorWidth<0, llvm_i1_ty>, // predicate: i1 vector with the result's element count
+ LLVMPointerToElt<0>, // base: pointer to the result's element type
+ LLVMScalarOrSameVectorWidth<0, llvm_i64_ty> // offsets or indices: i64 vector with the result's element count
+ ],
+ [IntrReadMem, IntrArgMemOnly]>; // declared as read-only and argument-memory-only
+
// This class of intrinsics are not intended to be useful within LLVM IR but
// are instead here to support some of the more rigid parts of the ACLE.
class Builtin_SVCVT<string name, LLVMType OUT, LLVMType IN>
@@ -1172,4 +1181,14 @@ def int_aarch64_sve_ucvtf_f64i32 : Builtin_SVCVT<"svcvt_f64_u32_m", llvm_nxv2
def int_aarch64_sve_punpkhi : AdvSIMD_SVE_PUNPKHI_Intrinsic;
def int_aarch64_sve_punpklo : AdvSIMD_SVE_PUNPKHI_Intrinsic;
+
+//
+// Gather loads:
+//
+
+// scalar + vector, 64 bit unscaled offsets
+def int_aarch64_sve_ld1_gather : AdvSIMD_GatherLoad_64bitOffset_Intrinsic;
+
+// scalar + vector, 64 bit scaled offsets
+def int_aarch64_sve_ld1_gather_index : AdvSIMD_GatherLoad_64bitOffset_Intrinsic;
}
diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
index 461d781effb8..0f6c2c5a628d 100644
--- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
+++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
@@ -1336,6 +1336,8 @@ const char *AArch64TargetLowering::getTargetNodeName(unsigned Opcode) const {
case AArch64ISD::UUNPKHI: return "AArch64ISD::UUNPKHI";
case AArch64ISD::UUNPKLO: return "AArch64ISD::UUNPKLO";
case AArch64ISD::INSR: return "AArch64ISD::INSR";
+ case AArch64ISD::GLD1: return "AArch64ISD::GLD1";
+ case AArch64ISD::GLD1_SCALED: return "AArch64ISD::GLD1_SCALED";
}
return nullptr;
}
@@ -11760,6 +11762,85 @@ static SDValue performGlobalAddressCombine(SDNode *N, SelectionDAG &DAG,
DAG.getConstant(MinOffset, DL, MVT::i64));
}
+// Returns the SVE container type for ContentTy, i.e. the type that ContentTy
+// can be trivially sign or zero extended into. FP types get the integer container with the same element count.
+static MVT getSVEContainerType(EVT ContentTy) {
+ assert(ContentTy.isSimple() && "No SVE containers for extended types");
+
+ switch (ContentTy.getSimpleVT().SimpleTy) {
+ default: // every type without an explicit case below is rejected
+ llvm_unreachable("No known SVE container for this MVT type");
+ case MVT::nxv2i8: // all nxv2* types share the nxv2i64 container
+ case MVT::nxv2i16:
+ case MVT::nxv2i32:
+ case MVT::nxv2i64:
+ case MVT::nxv2f32:
+ case MVT::nxv2f64:
+ return MVT::nxv2i64;
+ case MVT::nxv4i8: // all nxv4* types share the nxv4i32 container
+ case MVT::nxv4i16:
+ case MVT::nxv4i32:
+ case MVT::nxv4f32:
+ return MVT::nxv4i32;
+ }
+}
+
+static SDValue performLD1GatherCombine(SDNode *N, SelectionDAG &DAG, // Rewrites the gather-load intrinsic node N into a target node of kind Opcode.
+ unsigned Opcode) { // Returns an empty SDValue when N is left unchanged, otherwise the merged {value, chain} pair.
+ EVT RetVT = N->getValueType(0);
+ assert(RetVT.isScalableVector() &&
+ "Gather loads are only possible for SVE vectors");
+
+ SDLoc DL(N);
+ MVT RetElVT = RetVT.getVectorElementType().getSimpleVT();
+ unsigned NumElements = AArch64::SVEBitsPerBlock / RetElVT.getSizeInBits(); // elements of RetElVT that fit in one 128-bit block
+
+ EVT MaxVT = llvm::MVT::getScalableVectorVT(RetElVT, NumElements); // widest scalable vector of RetElVT considered here
+ if (RetVT.getSizeInBits().getKnownMinSize() >
+ MaxVT.getSizeInBits().getKnownMinSize())
+ return SDValue(); // RetVT is larger than MaxVT: do not combine
+
+ // Depending on the addressing mode, this is either a pointer or a vector of
+ // pointers (that fits into one register). Operand 1 is skipped - presumably the intrinsic ID; TODO confirm.
+ const SDValue Base = N->getOperand(3);
+ // Depending on the addressing mode, this is either a single offset or a
+ // vector of offsets (that fits into one register)
+ const SDValue Offset = N->getOperand(4);
+
+ if (!DAG.getTargetLoweringInfo().isTypeLegal(Base.getValueType()) ||
+ !DAG.getTargetLoweringInfo().isTypeLegal(Offset.getValueType()))
+ return SDValue(); // bail out while either address operand has an illegal type
+
+ // Return value type that is representable in hardware
+ EVT HwRetVt = getSVEContainerType(RetVT);
+
+ // Keep the original output value type around - this will better inform
+ // optimisations (e.g. instruction folding when load is followed by
+ // zext/sext). This will only be used for ints, so the value for FPs
+ // doesn't matter.
+ SDValue OutVT = DAG.getValueType(RetVT);
+ if (RetVT.isFloatingPoint())
+ OutVT = DAG.getValueType(HwRetVt);
+
+ SDVTList VTs = DAG.getVTList(HwRetVt, MVT::Other); // results: loaded value in the container type, then the chain
+ SDValue Ops[] = {N->getOperand(0), // Chain
+ N->getOperand(2), // Pg
+ Base, Offset, OutVT};
+
+ SDValue Load = DAG.getNode(Opcode, DL, VTs, Ops);
+ SDValue LoadChain = SDValue(Load.getNode(), 1); // result #1 of the new node is its output chain
+
+ if (RetVT.isInteger() && (RetVT != HwRetVt))
+ Load = DAG.getNode(ISD::TRUNCATE, DL, RetVT, Load.getValue(0)); // narrow the container back to the type the intrinsic returned
+
+ // If the original return value was FP, bitcast accordingly. Doing it here
+ // means that we can avoid adding TableGen patterns for FPs.
+ if (RetVT.isFloatingPoint())
+ Load = DAG.getNode(ISD::BITCAST, DL, RetVT, Load.getValue(0)); // NOTE(review): for nxv2f32 the container is nxv2i64, so the two types differ in size - confirm that case cannot reach this bitcast
+
+ return DAG.getMergeValues({Load, LoadChain}, DL); // replaces both the value and the chain result of N
+}
+
SDValue AArch64TargetLowering::PerformDAGCombine(SDNode *N,
DAGCombinerInfo &DCI) const {
SelectionDAG &DAG = DCI.DAG;
@@ -11846,6 +11927,10 @@ SDValue AArch64TargetLowering::PerformDAGCombine(SDNode *N,
case Intrinsic::aarch64_neon_st3lane:
case Intrinsic::aarch64_neon_st4lane:
return performNEONPostLDSTCombine(N, DCI, DAG);
+ case Intrinsic::aarch64_sve_ld1_gather:
+ return performLD1GatherCombine(N, DAG, AArch64ISD::GLD1);
+ case Intrinsic::aarch64_sve_ld1_gather_index:
+ return performLD1GatherCombine(N, DAG, AArch64ISD::GLD1_SCALED);
default:
break;
}
diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.h b/llvm/lib/Target/AArch64/AArch64ISelLowering.h
index 180dd50dc396..52bb0f25aa9a 100644
--- a/llvm/lib/Target/AArch64/AArch64ISelLowering.h
+++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.h
@@ -198,6 +198,10 @@ enum NodeType : unsigned {
INSR,
+ // Unsigned gather loads.
+ GLD1,
+ GLD1_SCALED,
+
// NEON Load/Store with post-increment base updates
LD2post = ISD::FIRST_TARGET_MEMORY_OPCODE,
LD3post,
diff --git a/llvm/lib/Target/AArch64/AArch64InstrFormats.td b/llvm/lib/Target/AArch64/AArch64InstrFormats.td
index 878cb79eb326..fee825422ca4 100644
--- a/llvm/lib/Target/AArch64/AArch64InstrFormats.td
+++ b/llvm/lib/Target/AArch64/AArch64InstrFormats.td
@@ -358,6 +358,16 @@ def am_indexed7s128 : ComplexPattern<i64, 2, "SelectAddrModeIndexed7S128", []>;
def am_indexedu6s128 : ComplexPattern<i64, 2, "SelectAddrModeIndexedU6S128", []>;
def am_indexeds9s128 : ComplexPattern<i64, 2, "SelectAddrModeIndexedS9S128", []>;
+def UImmS2XForm : SDNodeXForm<imm, [{
+ return CurDAG->getTargetConstant(N->getZExtValue() / 2, SDLoc(N), MVT::i64);
+}]>; // UImmS2XForm: re-emits the matched immediate divided by 2 as an i64 target constant
+def UImmS4XForm : SDNodeXForm<imm, [{
+ return CurDAG->getTargetConstant(N->getZExtValue() / 4, SDLoc(N), MVT::i64);
+}]>; // UImmS4XForm: same transform with a divisor of 4
+def UImmS8XForm : SDNodeXForm<imm, [{
+ return CurDAG->getTargetConstant(N->getZExtValue() / 8, SDLoc(N), MVT::i64);
+}]>; // UImmS8XForm: same transform with a divisor of 8
+
// uimm5sN predicate - True if the immediate is a multiple of N in the range
// [0 * N, 31 * N].
def UImm5s2Operand : UImmScaledMemoryIndexed<5, 2>;
@@ -365,17 +375,20 @@ def UImm5s4Operand : UImmScaledMemoryIndexed<5, 4>;
def UImm5s8Operand : UImmScaledMemoryIndexed<5, 8>;
def uimm5s2 : Operand<i64>, ImmLeaf<i64,
- [{ return Imm >= 0 && Imm < (32*2) && ((Imm % 2) == 0); }]> {
+ [{ return Imm >= 0 && Imm < (32*2) && ((Imm % 2) == 0); }],
+ UImmS2XForm> {
let ParserMatchClass = UImm5s2Operand;
let PrintMethod = "printImmScale<2>";
}
def uimm5s4 : Operand<i64>, ImmLeaf<i64,
- [{ return Imm >= 0 && Imm < (32*4) && ((Imm % 4) == 0); }]> {
+ [{ return Imm >= 0 && Imm < (32*4) && ((Imm % 4) == 0); }],
+ UImmS4XForm> {
let ParserMatchClass = UImm5s4Operand;
let PrintMethod = "printImmScale<4>";
}
def uimm5s8 : Operand<i64>, ImmLeaf<i64,
- [{ return Imm >= 0 && Imm < (32*8) && ((Imm % 8) == 0); }]> {
+ [{ return Imm >= 0 && Imm < (32*8) && ((Imm % 8) == 0); }],
+ UImmS8XForm> {
let ParserMatchClass = UImm5s8Operand;
let PrintMethod = "printImmScale<8>";
}
diff --git a/llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td b/llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td
index 43e5ac058885..575adeebc595 100644
--- a/llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td
+++ b/llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td
@@ -10,6 +10,14 @@
//
//===----------------------------------------------------------------------===//
+def SDT_AArch64_GLD1 : SDTypeProfile<1, 4, [ // one result and four operands; constraint index 0 refers to the result
+ SDTCisVec<0>, SDTCisVec<1>, SDTCisPtrTy<2>, SDTCisVec<3>, SDTCisVT<4, OtherVT>, // vector result, vector predicate, pointer base, vector offsets, value-type operand
+ SDTCVecEltisVT<1,i1>, SDTCisSameNumEltsAs<0,1> // the predicate is an i1 vector with as many elements as the result
+]>;
+
+def AArch64ld1_gather : SDNode<"AArch64ISD::GLD1", SDT_AArch64_GLD1, [SDNPHasChain, SDNPMayLoad, SDNPOptInGlue]>; // gather load, unscaled offsets
+def AArch64ld1_gather_scaled : SDNode<"AArch64ISD::GLD1_SCALED", SDT_AArch64_GLD1, [SDNPHasChain, SDNPMayLoad, SDNPOptInGlue]>; // gather load, scaled offsets
+
let Predicates = [HasSVE] in {
def RDFFR_PPz : sve_int_rdffr_pred<0b0, "rdffr">;
@@ -454,33 +462,33 @@ let Predicates = [HasSVE] in {
// Gathers using unscaled 64-bit offsets, e.g.
// ld1h z0.d, p0/z, [x0, z0.d]
- defm GLD1SB_D : sve_mem_64b_gld_vs2_64_unscaled<0b0000, "ld1sb">;
- defm GLDFF1SB_D : sve_mem_64b_gld_vs2_64_unscaled<0b0001, "ldff1sb">;
- defm GLD1B_D : sve_mem_64b_gld_vs2_64_unscaled<0b0010, "ld1b">;
- defm GLDFF1B_D : sve_mem_64b_gld_vs2_64_unscaled<0b0011, "ldff1b">;
- defm GLD1SH_D : sve_mem_64b_gld_vs2_64_unscaled<0b0100, "ld1sh">;
- defm GLDFF1SH_D : sve_mem_64b_gld_vs2_64_unscaled<0b0101, "ldff1sh">;
- defm GLD1H_D : sve_mem_64b_gld_vs2_64_unscaled<0b0110, "ld1h">;
- defm GLDFF1H_D : sve_mem_64b_gld_vs2_64_unscaled<0b0111, "ldff1h">;
- defm GLD1SW_D : sve_mem_64b_gld_vs2_64_unscaled<0b1000, "ld1sw">;
- defm GLDFF1SW_D : sve_mem_64b_gld_vs2_64_unscaled<0b1001, "ldff1sw">;
- defm GLD1W_D : sve_mem_64b_gld_vs2_64_unscaled<0b1010, "ld1w">;
- defm GLDFF1W_D : sve_mem_64b_gld_vs2_64_unscaled<0b1011, "ldff1w">;
- defm GLD1D : sve_mem_64b_gld_vs2_64_unscaled<0b1110, "ld1d">;
- defm GLDFF1D : sve_mem_64b_gld_vs2_64_unscaled<0b1111, "ldff1d">;
+ defm GLD1SB_D : sve_mem_64b_gld_vs2_64_unscaled<0b0000, "ld1sb", null_frag, nxv2i8>;
+ defm GLDFF1SB_D : sve_mem_64b_gld_vs2_64_unscaled<0b0001, "ldff1sb", null_frag, nxv2i8>;
+ defm GLD1B_D : sve_mem_64b_gld_vs2_64_unscaled<0b0010, "ld1b", AArch64ld1_gather, nxv2i8>;
+ defm GLDFF1B_D : sve_mem_64b_gld_vs2_64_unscaled<0b0011, "ldff1b", null_frag, nxv2i8>;
+ defm GLD1SH_D : sve_mem_64b_gld_vs2_64_unscaled<0b0100, "ld1sh", null_frag, nxv2i16>;
+ defm GLDFF1SH_D : sve_mem_64b_gld_vs2_64_unscaled<0b0101, "ldff1sh", null_frag, nxv2i16>;
+ defm GLD1H_D : sve_mem_64b_gld_vs2_64_unscaled<0b0110, "ld1h", AArch64ld1_gather, nxv2i16>;
+ defm GLDFF1H_D : sve_mem_64b_gld_vs2_64_unscaled<0b0111, "ldff1h", null_frag, nxv2i16>;
+ defm GLD1SW_D : sve_mem_64b_gld_vs2_64_unscaled<0b1000, "ld1sw", null_frag, nxv2i32>;
+ defm GLDFF1SW_D : sve_mem_64b_gld_vs2_64_unscaled<0b1001, "ldff1sw", null_frag, nxv2i32>;
+ defm GLD1W_D : sve_mem_64b_gld_vs2_64_unscaled<0b1010, "ld1w", AArch64ld1_gather, nxv2i32>;
+ defm GLDFF1W_D : sve_mem_64b_gld_vs2_64_unscaled<0b1011, "ldff1w", null_frag, nxv2i32>;
+ defm GLD1D : sve_mem_64b_gld_vs2_64_unscaled<0b1110, "ld1d", AArch64ld1_gather, nxv2i64>;
+ defm GLDFF1D : sve_mem_64b_gld_vs2_64_unscaled<0b1111, "ldff1d", null_frag, nxv2i64>;
// Gathers using scaled 64-bit offsets, e.g.
// ld1h z0.d, p0/z, [x0, z0.d, lsl #1]
- defm GLD1SH_D : sve_mem_64b_gld_sv2_64_scaled<0b0100, "ld1sh", ZPR64ExtLSL16>;
- defm GLDFF1SH_D : sve_mem_64b_gld_sv2_64_scaled<0b0101, "ldff1sh", ZPR64ExtLSL16>;
- defm GLD1H_D : sve_mem_64b_gld_sv2_64_scaled<0b0110, "ld1h", ZPR64ExtLSL16>;
- defm GLDFF1H_D : sve_mem_64b_gld_sv2_64_scaled<0b0111, "ldff1h", ZPR64ExtLSL16>;
- defm GLD1SW_D : sve_mem_64b_gld_sv2_64_scaled<0b1000, "ld1sw", ZPR64ExtLSL32>;
- defm GLDFF1SW_D : sve_mem_64b_gld_sv2_64_scaled<0b1001, "ldff1sw", ZPR64ExtLSL32>;
- defm GLD1W_D : sve_mem_64b_gld_sv2_64_scaled<0b1010, "ld1w", ZPR64ExtLSL32>;
- defm GLDFF1W_D : sve_mem_64b_gld_sv2_64_scaled<0b1011, "ldff1w", ZPR64ExtLSL32>;
- defm GLD1D : sve_mem_64b_gld_sv2_64_scaled<0b1110, "ld1d", ZPR64ExtLSL64>;
- defm GLDFF1D : sve_mem_64b_gld_sv2_64_scaled<0b1111, "ldff1d", ZPR64ExtLSL64>;
+ defm GLD1SH_D : sve_mem_64b_gld_sv2_64_scaled<0b0100, "ld1sh", null_frag, ZPR64ExtLSL16, nxv2i16>;
+ defm GLDFF1SH_D : sve_mem_64b_gld_sv2_64_scaled<0b0101, "ldff1sh", null_frag, ZPR64ExtLSL16, nxv2i16>;
+ defm GLD1H_D : sve_mem_64b_gld_sv2_64_scaled<0b0110, "ld1h", AArch64ld1_gather_scaled, ZPR64ExtLSL16, nxv2i16>;
+ defm GLDFF1H_D : sve_mem_64b_gld_sv2_64_scaled<0b0111, "ldff1h", null_frag, ZPR64ExtLSL16, nxv2i16>;
+ defm GLD1SW_D : sve_mem_64b_gld_sv2_64_scaled<0b1000, "ld1sw", null_frag, ZPR64ExtLSL32, nxv2i32>;
+ defm GLDFF1SW_D : sve_mem_64b_gld_sv2_64_scaled<0b1001, "ldff1sw", null_frag, ZPR64ExtLSL32, nxv2i32>;
+ defm GLD1W_D : sve_mem_64b_gld_sv2_64_scaled<0b1010, "ld1w", AArch64ld1_gather_scaled, ZPR64ExtLSL32, nxv2i32>;
+ defm GLDFF1W_D : sve_mem_64b_gld_sv2_64_scaled<0b1011, "ldff1w", null_frag, ZPR64ExtLSL32, nxv2i32>;
+ defm GLD1D : sve_mem_64b_gld_sv2_64_scaled<0b1110, "ld1d", AArch64ld1_gather_scaled, ZPR64ExtLSL64, nxv2i64>;
+ defm GLDFF1D : sve_mem_64b_gld_sv2_64_scaled<0b1111, "ldff1d", null_frag, ZPR64ExtLSL64, nxv2i64>;
// Gathers using unscaled 32-bit offsets unpacked in 64-bits elements, e.g.
// ld1h z0.d, p0/z, [x0, z0.d, uxtw]
diff --git a/llvm/lib/Target/AArch64/SVEInstrFormats.td b/llvm/lib/Target/AArch64/SVEInstrFormats.td
index 9169e463c662..68329a2a2e4f 100644
--- a/llvm/lib/Target/AArch64/SVEInstrFormats.td
+++ b/llvm/lib/Target/AArch64/SVEInstrFormats.td
@@ -5584,18 +5584,26 @@ multiclass sve_mem_64b_gld_vs_32_unscaled<bits<4> opc, string asm,
}
multiclass sve_mem_64b_gld_sv2_64_scaled<bits<4> opc, string asm,
- RegisterOperand zprext> {
+ SDPatternOperator op,
+ RegisterOperand zprext, ValueType vt> {
def _SCALED_REAL : sve_mem_64b_gld_sv<opc, 1, 1, 1, asm, zprext>;
def : InstAlias<asm # "\t$Zt, $Pg/z, [$Rn, $Zm]",
(!cast<Instruction>(NAME # _SCALED_REAL) ZPR64:$Zt, PPR3bAny:$Pg, GPR64sp:$Rn, zprext:$Zm), 0>;
+
+ def : Pat<(nxv2i64 (op (nxv2i1 PPR:$gp), GPR64sp:$base, (nxv2i64 ZPR:$indices), vt)),
+ (!cast<Instruction>(NAME # _SCALED_REAL) PPR:$gp, GPR64sp:$base, ZPR:$indices)>;
}
-multiclass sve_mem_64b_gld_vs2_64_unscaled<bits<4> opc, string asm> {
+multiclass sve_mem_64b_gld_vs2_64_unscaled<bits<4> opc, string asm,
+ SDPatternOperator op, ValueType vt> {
def _REAL : sve_mem_64b_gld_sv<opc, 1, 0, 1, asm, ZPR64ExtLSL8>;
def : InstAlias<asm # "\t$Zt, $Pg/z, [$Rn, $Zm]",
(!cast<Instruction>(NAME # _REAL) ZPR64:$Zt, PPR3bAny:$Pg, GPR64sp:$Rn, ZPR64ExtLSL8:$Zm), 0>;
+
+ def : Pat<(nxv2i64 (op (nxv2i1 PPR:$gp), GPR64sp:$base, (nxv2i64 ZPR:$offsets), vt)),
+ (!cast<Instruction>(NAME # _REAL) PPR:$gp, GPR64sp:$base, ZPR:$offsets)>;
}
class sve_mem_64b_gld_vi<bits<4> opc, string asm, Operand imm_ty>
diff --git a/llvm/lib/Target/AArch64/Utils/AArch64BaseInfo.h b/llvm/lib/Target/AArch64/Utils/AArch64BaseInfo.h
index 7a4fcac09ec4..57c126fe6494 100644
--- a/llvm/lib/Target/AArch64/Utils/AArch64BaseInfo.h
+++ b/llvm/lib/Target/AArch64/Utils/AArch64BaseInfo.h
@@ -643,6 +643,17 @@ namespace AArch64II {
};
} // end namespace AArch64II
+namespace AArch64 {
+// The number of bits in an SVE register is architecturally defined
+// to be a multiple of this value. If <M x t> has this number of bits,
+// a <n x M x t> vector can be stored in an SVE register without any
+// redundant bits. If <M x t> has this number of bits divided by P,
+// a <n x M x t> vector is stored in an SVE register by placing index i
+// in index i*P of a <n x (M*P) x t> vector. The other elements of the
+// <n x (M*P) x t> vector (such as index 1) are undefined.
+static constexpr unsigned SVEBitsPerBlock = 128; // block size in bits
+} // end namespace AArch64
+
} // end namespace llvm
#endif
diff --git a/llvm/test/CodeGen/AArch64/sve-intrinsics-gather-loads-64bit-scaled-offset.ll b/llvm/test/CodeGen/AArch64/sve-intrinsics-gather-loads-64bit-scaled-offset.ll
new file mode 100644
index 000000000000..274eaad0eef1
--- /dev/null
+++ b/llvm/test/CodeGen/AArch64/sve-intrinsics-gather-loads-64bit-scaled-offset.ll
@@ -0,0 +1,59 @@
+; RUN: llc -mtriple=aarch64-linux-gnu -mattr=+sve < %s | FileCheck %s
+
+;
+; LD1H, LD1W, LD1D: base + 64-bit scaled offset
+; e.g. ld1h z0.d, p0/z, [x0, z0.d, lsl #1]
+;
+
+define <vscale x 2 x i64> @gld1h_index(<vscale x 2 x i1> %pg, i16* %base, <vscale x 2 x i64> %b) {
+; CHECK-LABEL: gld1h_index
+; CHECK: ld1h { z0.d }, p0/z, [x0, z0.d, lsl #1]
+; CHECK-NEXT: mov w8, #65535
+; CHECK-NEXT: mov z1.d, x8
+; CHECK-NEXT: and z0.d, z0.d, z1.d
+; CHECK-NEXT: ret
+ %load = call <vscale x 2 x i16> @llvm.aarch64.sve.ld1.gather.index.nxv2i16(<vscale x 2 x i1> %pg,
+ i16* %base,
+ <vscale x 2 x i64> %b)
+ %res = zext <vscale x 2 x i16> %load to <vscale x 2 x i64>
+ ret <vscale x 2 x i64> %res
+}
+
+define <vscale x 2 x i64> @gld1w_index(<vscale x 2 x i1> %pg, i32* %base, <vscale x 2 x i64> %b) {
+; CHECK-LABEL: gld1w_index
+; CHECK: ld1w { z0.d }, p0/z, [x0, z0.d, lsl #2]
+; CHECK-NEXT: mov w8, #-1
+; CHECK-NEXT: mov z1.d, x8
+; CHECK-NEXT: and z0.d, z0.d, z1.d
+; CHECK-NEXT: ret
+ %load = call <vscale x 2 x i32> @llvm.aarch64.sve.ld1.gather.index.nxv2i32(<vscale x 2 x i1> %pg,
+ i32* %base,
+ <vscale x 2 x i64> %b)
+ %res = zext <vscale x 2 x i32> %load to <vscale x 2 x i64>
+ ret <vscale x 2 x i64> %res
+}
+
+define <vscale x 2 x i64> @gld1d_index(<vscale x 2 x i1> %pg, i64* %base, <vscale x 2 x i64> %b) {
+; CHECK-LABEL: gld1d_index
+; CHECK: ld1d { z0.d }, p0/z, [x0, z0.d, lsl #3]
+; CHECK-NEXT: ret
+ %load = call <vscale x 2 x i64> @llvm.aarch64.sve.ld1.gather.index.nxv2i64(<vscale x 2 x i1> %pg,
+ i64* %base,
+ <vscale x 2 x i64> %b)
+ ret <vscale x 2 x i64> %load
+}
+
+define <vscale x 2 x double> @gld1d_index_double(<vscale x 2 x i1> %pg, double* %base, <vscale x 2 x i64> %b) {
+; CHECK-LABEL: gld1d_index_double
+; CHECK: ld1d { z0.d }, p0/z, [x0, z0.d, lsl #3]
+; CHECK-NEXT: ret
+ %load = call <vscale x 2 x double> @llvm.aarch64.sve.ld1.gather.index.nxv2f64(<vscale x 2 x i1> %pg,
+ double* %base,
+ <vscale x 2 x i64> %b)
+ ret <vscale x 2 x double> %load
+}
+
+declare <vscale x 2 x i16> @llvm.aarch64.sve.ld1.gather.index.nxv2i16(<vscale x 2 x i1>, i16*, <vscale x 2 x i64>)
+declare <vscale x 2 x i32> @llvm.aarch64.sve.ld1.gather.index.nxv2i32(<vscale x 2 x i1>, i32*, <vscale x 2 x i64>)
+declare <vscale x 2 x i64> @llvm.aarch64.sve.ld1.gather.index.nxv2i64(<vscale x 2 x i1>, i64*, <vscale x 2 x i64>)
+declare <vscale x 2 x double> @llvm.aarch64.sve.ld1.gather.index.nxv2f64(<vscale x 2 x i1>, double*, <vscale x 2 x i64>)
diff --git a/llvm/test/CodeGen/AArch64/sve-intrinsics-gather-loads-64bit-unscaled-offset.ll b/llvm/test/CodeGen/AArch64/sve-intrinsics-gather-loads-64bit-unscaled-offset.ll
new file mode 100644
index 000000000000..9a8df453b336
--- /dev/null
+++ b/llvm/test/CodeGen/AArch64/sve-intrinsics-gather-loads-64bit-unscaled-offset.ll
@@ -0,0 +1,74 @@
+; RUN: llc -mtriple=aarch64-linux-gnu -mattr=+sve < %s | FileCheck %s
+
+;
+; LD1B, LD1W, LD1H, LD1D: base + 64-bit unscaled offset
+; e.g. ld1h { z0.d }, p0/z, [x0, z0.d]
+;
+
+define <vscale x 2 x i64> @gld1b_d(<vscale x 2 x i1> %pg, i8* %base, <vscale x 2 x i64> %b) {
+; CHECK-LABEL: gld1b_d:
+; CHECK: ld1b { z0.d }, p0/z, [x0, z0.d]
+; CHECK-NEXT: mov w8, #255
+; CHECK-NEXT: mov z1.d, x8
+; CHECK-NEXT: and z0.d, z0.d, z1.d
+; CHECK-NEXT: ret
+ %load = call <vscale x 2 x i8> @llvm.aarch64.sve.ld1.gather.nxv2i8(<vscale x 2 x i1> %pg,
+ i8* %base,
+ <vscale x 2 x i64> %b)
+ %res = zext <vscale x 2 x i8> %load to <vscale x 2 x i64>
+ ret <vscale x 2 x i64> %res
+}
+
+define <vscale x 2 x i64> @gld1h_d(<vscale x 2 x i1> %pg, i16* %base, <vscale x 2 x i64> %b) {
+; CHECK-LABEL: gld1h_d:
+; CHECK: ld1h { z0.d }, p0/z, [x0, z0.d]
+; CHECK-NEXT: mov w8, #65535
+; CHECK-NEXT: mov z1.d, x8
+; CHECK-NEXT: and z0.d, z0.d, z1.d
+; CHECK-NEXT: ret
+ %load = call <vscale x 2 x i16> @llvm.aarch64.sve.ld1.gather.nxv2i16(<vscale x 2 x i1> %pg,
+ i16* %base,
+ <vscale x 2 x i64> %b)
+ %res = zext <vscale x 2 x i16> %load to <vscale x 2 x i64>
+ ret <vscale x 2 x i64> %res
+}
+
+define <vscale x 2 x i64> @gld1w_d(<vscale x 2 x i1> %pg, i32* %base, <vscale x 2 x i64> %offsets) {
+; CHECK-LABEL: gld1w_d:
+; CHECK: ld1w { z0.d }, p0/z, [x0, z0.d]
+; CHECK-NEXT: mov w8, #-1
+; CHECK-NEXT: mov z1.d, x8
+; CHECK-NEXT: and z0.d, z0.d, z1.d
+; CHECK-NEXT: ret
+ %load = call <vscale x 2 x i32> @llvm.aarch64.sve.ld1.gather.nxv2i32(<vscale x 2 x i1> %pg,
+ i32* %base,
+ <vscale x 2 x i64> %offsets)
+ %res = zext <vscale x 2 x i32> %load to <vscale x 2 x i64>
+ ret <vscale x 2 x i64> %res
+}
+
+define <vscale x 2 x i64> @gld1d_d(<vscale x 2 x i1> %pg, i64* %base, <vscale x 2 x i64> %b) {
+; CHECK-LABEL: gld1d_d:
+; CHECK: ld1d { z0.d }, p0/z, [x0, z0.d]
+; CHECK-NEXT: ret
+ %load = call <vscale x 2 x i64> @llvm.aarch64.sve.ld1.gather.nxv2i64(<vscale x 2 x i1> %pg,
+ i64* %base,
+ <vscale x 2 x i64> %b)
+ ret <vscale x 2 x i64> %load
+}
+
+define <vscale x 2 x double> @gld1d_d_double(<vscale x 2 x i1> %pg, double* %base, <vscale x 2 x i64> %b) {
+; CHECK-LABEL: gld1d_d_double:
+; CHECK: ld1d { z0.d }, p0/z, [x0, z0.d]
+; CHECK-NEXT: ret
+ %load = call <vscale x 2 x double> @llvm.aarch64.sve.ld1.gather.nxv2f64(<vscale x 2 x i1> %pg,
+ double* %base,
+ <vscale x 2 x i64> %b)
+ ret <vscale x 2 x double> %load
+}
+
+declare <vscale x 2 x i8> @llvm.aarch64.sve.ld1.gather.nxv2i8(<vscale x 2 x i1>, i8*, <vscale x 2 x i64>)
+declare <vscale x 2 x i16> @llvm.aarch64.sve.ld1.gather.nxv2i16(<vscale x 2 x i1>, i16*, <vscale x 2 x i64>)
+declare <vscale x 2 x i32> @llvm.aarch64.sve.ld1.gather.nxv2i32(<vscale x 2 x i1>, i32*, <vscale x 2 x i64>)
+declare <vscale x 2 x i64> @llvm.aarch64.sve.ld1.gather.nxv2i64(<vscale x 2 x i1>, i64*, <vscale x 2 x i64>)
+declare <vscale x 2 x double> @llvm.aarch64.sve.ld1.gather.nxv2f64(<vscale x 2 x i1>, double*, <vscale x 2 x i64>)
More information about the llvm-commits
mailing list