[llvm] cdcc4f2 - [AArch64][SVE] Add intrinsic for non-faulting loads
Kerry McLaughlin via llvm-commits
llvm-commits at lists.llvm.org
Wed Jan 22 03:35:00 PST 2020
Author: Kerry McLaughlin
Date: 2020-01-22T11:15:20Z
New Revision: cdcc4f2a44b5ac0f32c3af844040994c6effaa2f
URL: https://github.com/llvm/llvm-project/commit/cdcc4f2a44b5ac0f32c3af844040994c6effaa2f
DIFF: https://github.com/llvm/llvm-project/commit/cdcc4f2a44b5ac0f32c3af844040994c6effaa2f.diff
LOG: [AArch64][SVE] Add intrinsic for non-faulting loads
Summary:
This patch adds the llvm.aarch64.sve.ldnf1 intrinsic, plus DAG combine
rules for folding non-faulting loads with sign/zero extends. (A minimal
IR usage sketch is included after the commit metadata below.)
Reviewers: sdesmalen, efriedma, andwar, dancgr, mgudim, rengolin
Reviewed By: sdesmalen
Subscribers: tschuett, kristof.beyls, hiraditya, rkruppe, psnobl, cameron.mcinally, cfe-commits, llvm-commits
Tags: #llvm
Differential Revision: https://reviews.llvm.org/D71698
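
A minimal IR usage sketch, adapted from the added test file (the function
name is illustrative): a narrow non-faulting load under a predicate
followed by a sign extend, which the new LDNF1S combine folds into a
single ldnf1sb instruction.

  define <vscale x 8 x i16> @example_ldnf1sb_h(<vscale x 8 x i1> %pg, i8* %a) {
    ; Non-faulting load of 8 x i8 lanes under predicate %pg.
    %load = call <vscale x 8 x i8> @llvm.aarch64.sve.ldnf1.nxv8i8(<vscale x 8 x i1> %pg, i8* %a)
    ; Sign-extend to i16; performSignExtendInRegCombine rewrites the pair as an
    ; AArch64ISD::LDNF1S node, selected as "ldnf1sb { z0.h }, p0/z, [x0]".
    %res = sext <vscale x 8 x i8> %load to <vscale x 8 x i16>
    ret <vscale x 8 x i16> %res
  }

  declare <vscale x 8 x i8> @llvm.aarch64.sve.ldnf1.nxv8i8(<vscale x 8 x i1>, i8*)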
Added:
llvm/test/CodeGen/AArch64/sve-intrinsics-loads-nf.ll
Modified:
llvm/include/llvm/IR/IntrinsicsAArch64.td
llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
llvm/lib/Target/AArch64/AArch64ISelLowering.h
llvm/lib/Target/AArch64/AArch64InstrInfo.td
llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td
llvm/lib/Target/AArch64/SVEInstrFormats.td
Removed:
################################################################################
diff --git a/llvm/include/llvm/IR/IntrinsicsAArch64.td b/llvm/include/llvm/IR/IntrinsicsAArch64.td
index f0d727edb69b..a1703b602c60 100644
--- a/llvm/include/llvm/IR/IntrinsicsAArch64.td
+++ b/llvm/include/llvm/IR/IntrinsicsAArch64.td
@@ -775,6 +775,12 @@ let TargetPrefix = "aarch64" in { // All intrinsics start with "llvm.aarch64.".
LLVMPointerTo<0>],
[IntrReadMem, IntrArgMemOnly]>;
+ class AdvSIMD_1Vec_PredFaultingLoad_Intrinsic
+ : Intrinsic<[llvm_anyvector_ty],
+ [LLVMScalarOrSameVectorWidth<0, llvm_i1_ty>,
+ LLVMPointerToElt<0>],
+ [IntrReadMem, IntrArgMemOnly]>;
+
class AdvSIMD_1Vec_PredStore_Intrinsic
: Intrinsic<[],
[llvm_anyvector_ty,
@@ -1169,6 +1175,8 @@ class AdvSIMD_ScatterStore_VectorBase_Intrinsic
def int_aarch64_sve_ldnt1 : AdvSIMD_1Vec_PredLoad_Intrinsic;
+def int_aarch64_sve_ldnf1 : AdvSIMD_1Vec_PredFaultingLoad_Intrinsic;
+
//
// Stores
//
diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
index 7cbc441bec7a..e3e8b5f48020 100644
--- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
+++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
@@ -1373,6 +1373,8 @@ const char *AArch64TargetLowering::getTargetNodeName(unsigned Opcode) const {
case AArch64ISD::INSR: return "AArch64ISD::INSR";
case AArch64ISD::PTEST: return "AArch64ISD::PTEST";
case AArch64ISD::PTRUE: return "AArch64ISD::PTRUE";
+ case AArch64ISD::LDNF1: return "AArch64ISD::LDNF1";
+ case AArch64ISD::LDNF1S: return "AArch64ISD::LDNF1S";
case AArch64ISD::GLD1: return "AArch64ISD::GLD1";
case AArch64ISD::GLD1_SCALED: return "AArch64ISD::GLD1_SCALED";
case AArch64ISD::GLD1_SXTW: return "AArch64ISD::GLD1_SXTW";
@@ -10225,9 +10227,14 @@ static SDValue performSVEAndCombine(SDNode *N,
if (!Src.hasOneUse())
return SDValue();
- // GLD1* instructions perform an implicit zero-extend, which makes them
+ EVT MemVT;
+
+ // SVE load instructions perform an implicit zero-extend, which makes them
// perfect candidates for combining.
switch (Src->getOpcode()) {
+ case AArch64ISD::LDNF1:
+ MemVT = cast<VTSDNode>(Src->getOperand(3))->getVT();
+ break;
case AArch64ISD::GLD1:
case AArch64ISD::GLD1_SCALED:
case AArch64ISD::GLD1_SXTW:
@@ -10235,13 +10242,12 @@ static SDValue performSVEAndCombine(SDNode *N,
case AArch64ISD::GLD1_UXTW:
case AArch64ISD::GLD1_UXTW_SCALED:
case AArch64ISD::GLD1_IMM:
+ MemVT = cast<VTSDNode>(Src->getOperand(4))->getVT();
break;
default:
return SDValue();
}
- EVT MemVT = cast<VTSDNode>(Src->getOperand(4))->getVT();
-
if (isConstantSplatVectorMaskForType(Mask.getNode(), MemVT))
return Src;
@@ -11217,6 +11223,35 @@ static SDValue splitStoreSplat(SelectionDAG &DAG, StoreSDNode &St,
return NewST1;
}
+// Returns an SVE type that ContentTy can be trivially sign or zero extended
+// into.
+static MVT getSVEContainerType(EVT ContentTy) {
+ assert(ContentTy.isSimple() && "No SVE containers for extended types");
+
+ switch (ContentTy.getSimpleVT().SimpleTy) {
+ default:
+ llvm_unreachable("No known SVE container for this MVT type");
+ case MVT::nxv2i8:
+ case MVT::nxv2i16:
+ case MVT::nxv2i32:
+ case MVT::nxv2i64:
+ case MVT::nxv2f32:
+ case MVT::nxv2f64:
+ return MVT::nxv2i64;
+ case MVT::nxv4i8:
+ case MVT::nxv4i16:
+ case MVT::nxv4i32:
+ case MVT::nxv4f32:
+ return MVT::nxv4i32;
+ case MVT::nxv8i8:
+ case MVT::nxv8i16:
+ case MVT::nxv8f16:
+ return MVT::nxv8i16;
+ case MVT::nxv16i8:
+ return MVT::nxv16i8;
+ }
+}
+
static SDValue performLDNT1Combine(SDNode *N, SelectionDAG &DAG) {
SDLoc DL(N);
EVT VT = N->getValueType(0);
@@ -11259,6 +11294,32 @@ static SDValue performSTNT1Combine(SDNode *N, SelectionDAG &DAG) {
ISD::UNINDEXED, false, false);
}
+static SDValue performLDNF1Combine(SDNode *N, SelectionDAG &DAG) {
+ SDLoc DL(N);
+ EVT VT = N->getValueType(0);
+
+ if (VT.getSizeInBits().getKnownMinSize() > AArch64::SVEBitsPerBlock)
+ return SDValue();
+
+ EVT ContainerVT = VT;
+ if (ContainerVT.isInteger())
+ ContainerVT = getSVEContainerType(ContainerVT);
+
+ SDVTList VTs = DAG.getVTList(ContainerVT, MVT::Other);
+ SDValue Ops[] = { N->getOperand(0), // Chain
+ N->getOperand(2), // Pg
+ N->getOperand(3), // Base
+ DAG.getValueType(VT) };
+
+ SDValue Load = DAG.getNode(AArch64ISD::LDNF1, DL, VTs, Ops);
+ SDValue LoadChain = SDValue(Load.getNode(), 1);
+
+ if (ContainerVT.isInteger() && (VT != ContainerVT))
+ Load = DAG.getNode(ISD::TRUNCATE, DL, VT, Load.getValue(0));
+
+ return DAG.getMergeValues({ Load, LoadChain }, DL);
+}
+
/// Replace a splat of zeros to a vector store by scalar stores of WZR/XZR. The
/// load store optimizer pass will merge them to store pair stores. This should
/// be better than a movi to create the vector zero followed by a vector store
@@ -12310,29 +12371,6 @@ static SDValue performGlobalAddressCombine(SDNode *N, SelectionDAG &DAG,
DAG.getConstant(MinOffset, DL, MVT::i64));
}
-// Returns an SVE type that ContentTy can be trivially sign or zero extended
-// into.
-static MVT getSVEContainerType(EVT ContentTy) {
- assert(ContentTy.isSimple() && "No SVE containers for extended types");
-
- switch (ContentTy.getSimpleVT().SimpleTy) {
- default:
- llvm_unreachable("No known SVE container for this MVT type");
- case MVT::nxv2i8:
- case MVT::nxv2i16:
- case MVT::nxv2i32:
- case MVT::nxv2i64:
- case MVT::nxv2f32:
- case MVT::nxv2f64:
- return MVT::nxv2i64;
- case MVT::nxv4i8:
- case MVT::nxv4i16:
- case MVT::nxv4i32:
- case MVT::nxv4f32:
- return MVT::nxv4i32;
- }
-}
-
static SDValue performST1ScatterCombine(SDNode *N, SelectionDAG &DAG,
unsigned Opcode,
bool OnlyPackedOffsets = true) {
@@ -12520,10 +12558,15 @@ performSignExtendInRegCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI,
SDValue Src = N->getOperand(0);
unsigned Opc = Src->getOpcode();
- // Gather load nodes (e.g. AArch64ISD::GLD1) are straightforward candidates
+ // SVE load nodes (e.g. AArch64ISD::GLD1) are straightforward candidates
// for DAG Combine with SIGN_EXTEND_INREG. Bail out for all other nodes.
unsigned NewOpc;
+ unsigned MemVTOpNum = 4;
switch (Opc) {
+ case AArch64ISD::LDNF1:
+ NewOpc = AArch64ISD::LDNF1S;
+ MemVTOpNum = 3;
+ break;
case AArch64ISD::GLD1:
NewOpc = AArch64ISD::GLD1S;
break;
@@ -12550,15 +12593,17 @@ performSignExtendInRegCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI,
}
EVT SignExtSrcVT = cast<VTSDNode>(N->getOperand(1))->getVT();
- EVT GLD1SrcMemVT = cast<VTSDNode>(Src->getOperand(4))->getVT();
+ EVT SrcMemVT = cast<VTSDNode>(Src->getOperand(MemVTOpNum))->getVT();
- if ((SignExtSrcVT != GLD1SrcMemVT) || !Src.hasOneUse())
+ if ((SignExtSrcVT != SrcMemVT) || !Src.hasOneUse())
return SDValue();
EVT DstVT = N->getValueType(0);
SDVTList VTs = DAG.getVTList(DstVT, MVT::Other);
- SDValue Ops[] = {Src->getOperand(0), Src->getOperand(1), Src->getOperand(2),
- Src->getOperand(3), Src->getOperand(4)};
+
+ SmallVector<SDValue, 5> Ops;
+ for (unsigned I = 0; I < Src->getNumOperands(); ++I)
+ Ops.push_back(Src->getOperand(I));
SDValue ExtLoad = DAG.getNode(NewOpc, SDLoc(N), VTs, Ops);
DCI.CombineTo(N, ExtLoad);
@@ -12656,6 +12701,8 @@ SDValue AArch64TargetLowering::PerformDAGCombine(SDNode *N,
return performNEONPostLDSTCombine(N, DCI, DAG);
case Intrinsic::aarch64_sve_ldnt1:
return performLDNT1Combine(N, DAG);
+ case Intrinsic::aarch64_sve_ldnf1:
+ return performLDNF1Combine(N, DAG);
case Intrinsic::aarch64_sve_stnt1:
return performSTNT1Combine(N, DAG);
case Intrinsic::aarch64_sve_ld1_gather:
diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.h b/llvm/lib/Target/AArch64/AArch64ISelLowering.h
index 09a4d324e535..cad8b5b647bf 100644
--- a/llvm/lib/Target/AArch64/AArch64ISelLowering.h
+++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.h
@@ -215,6 +215,9 @@ enum NodeType : unsigned {
PTEST,
PTRUE,
+ LDNF1,
+ LDNF1S,
+
// Unsigned gather loads.
GLD1,
GLD1_SCALED,
diff --git a/llvm/lib/Target/AArch64/AArch64InstrInfo.td b/llvm/lib/Target/AArch64/AArch64InstrInfo.td
index c2853da050f1..73243826d472 100644
--- a/llvm/lib/Target/AArch64/AArch64InstrInfo.td
+++ b/llvm/lib/Target/AArch64/AArch64InstrInfo.td
@@ -549,6 +549,13 @@ def AArch64stnp : SDNode<"AArch64ISD::STNP", SDT_AArch64stnp, [SDNPHasChain, SDN
def AArch64tbl : SDNode<"AArch64ISD::TBL", SDT_AArch64TBL>;
+def SDT_AArch64_LDNF1 : SDTypeProfile<1, 3, [
+ SDTCisVec<0>, SDTCisVec<1>, SDTCisPtrTy<2>,
+ SDTCVecEltisVT<1,i1>, SDTCisSameNumEltsAs<0,1>
+]>;
+
+def AArch64ldnf1 : SDNode<"AArch64ISD::LDNF1", SDT_AArch64_LDNF1, [SDNPHasChain, SDNPMayLoad, SDNPOptInGlue]>;
+
//===----------------------------------------------------------------------===//
//===----------------------------------------------------------------------===//
diff --git a/llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td b/llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td
index bab89e7c5654..70475e459ae8 100644
--- a/llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td
+++ b/llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td
@@ -57,6 +57,7 @@ def sve_cnth_imm_neg : ComplexPattern<i32, 1, "SelectRDVLImm<1, 16, -8>">;
def sve_cntw_imm_neg : ComplexPattern<i32, 1, "SelectRDVLImm<1, 16, -4>">;
def sve_cntd_imm_neg : ComplexPattern<i32, 1, "SelectRDVLImm<1, 16, -2>">;
+def AArch64ldnf1s : SDNode<"AArch64ISD::LDNF1S", SDT_AArch64_LDNF1, [SDNPHasChain, SDNPMayLoad, SDNPOptInGlue]>;
def AArch64ld1s_gather : SDNode<"AArch64ISD::GLD1S", SDT_AArch64_GLD1, [SDNPHasChain, SDNPMayLoad, SDNPOptInGlue]>;
def AArch64ld1s_gather_scaled : SDNode<"AArch64ISD::GLD1S_SCALED", SDT_AArch64_GLD1, [SDNPHasChain, SDNPMayLoad, SDNPOptInGlue]>;
def AArch64ld1s_gather_uxtw : SDNode<"AArch64ISD::GLD1S_UXTW", SDT_AArch64_GLD1, [SDNPHasChain, SDNPMayLoad, SDNPOptInGlue]>;
@@ -1259,6 +1260,40 @@ let Predicates = [HasSVE] in {
defm : pred_store<nxv8i16, nxv8i1, non_temporal_store, STNT1H_ZRI>;
defm : pred_store<nxv4i32, nxv4i1, non_temporal_store, STNT1W_ZRI>;
defm : pred_store<nxv2i64, nxv2i1, non_temporal_store, STNT1D_ZRI>;
+
+ multiclass ldnf1<Instruction I, ValueType Ty, SDPatternOperator Load, ValueType PredTy, ValueType MemVT> {
+ // base
+ def : Pat<(Ty (Load (PredTy PPR:$gp), GPR64:$base, MemVT)),
+ (I PPR:$gp, GPR64sp:$base, (i64 0))>;
+ }
+
+ // 2-element contiguous non-faulting loads
+ defm : ldnf1<LDNF1B_D_IMM, nxv2i64, AArch64ldnf1, nxv2i1, nxv2i8>;
+ defm : ldnf1<LDNF1SB_D_IMM, nxv2i64, AArch64ldnf1s, nxv2i1, nxv2i8>;
+ defm : ldnf1<LDNF1H_D_IMM, nxv2i64, AArch64ldnf1, nxv2i1, nxv2i16>;
+ defm : ldnf1<LDNF1SH_D_IMM, nxv2i64, AArch64ldnf1s, nxv2i1, nxv2i16>;
+ defm : ldnf1<LDNF1W_D_IMM, nxv2i64, AArch64ldnf1, nxv2i1, nxv2i32>;
+ defm : ldnf1<LDNF1SW_D_IMM, nxv2i64, AArch64ldnf1s, nxv2i1, nxv2i32>;
+ defm : ldnf1<LDNF1D_IMM, nxv2i64, AArch64ldnf1, nxv2i1, nxv2i64>;
+ defm : ldnf1<LDNF1D_IMM, nxv2f64, AArch64ldnf1, nxv2i1, nxv2f64>;
+
+ // 4-element contiguous non-faulting loads
+ defm : ldnf1<LDNF1B_S_IMM, nxv4i32, AArch64ldnf1, nxv4i1, nxv4i8>;
+ defm : ldnf1<LDNF1SB_S_IMM, nxv4i32, AArch64ldnf1s, nxv4i1, nxv4i8>;
+ defm : ldnf1<LDNF1H_S_IMM, nxv4i32, AArch64ldnf1, nxv4i1, nxv4i16>;
+ defm : ldnf1<LDNF1SH_S_IMM, nxv4i32, AArch64ldnf1s, nxv4i1, nxv4i16>;
+ defm : ldnf1<LDNF1W_IMM, nxv4i32, AArch64ldnf1, nxv4i1, nxv4i32>;
+ defm : ldnf1<LDNF1W_IMM, nxv4f32, AArch64ldnf1, nxv4i1, nxv4f32>;
+
+ // 8-element contiguous non-faulting loads
+ defm : ldnf1<LDNF1B_H_IMM, nxv8i16, AArch64ldnf1, nxv8i1, nxv8i8>;
+ defm : ldnf1<LDNF1SB_H_IMM, nxv8i16, AArch64ldnf1s, nxv8i1, nxv8i8>;
+ defm : ldnf1<LDNF1H_IMM, nxv8i16, AArch64ldnf1, nxv8i1, nxv8i16>;
+ defm : ldnf1<LDNF1H_IMM, nxv8f16, AArch64ldnf1, nxv8i1, nxv8f16>;
+
+ // 16-element contiguous non-faulting loads
+ defm : ldnf1<LDNF1B_IMM, nxv16i8, AArch64ldnf1, nxv16i1, nxv16i8>;
+
}
let Predicates = [HasSVE2] in {
diff --git a/llvm/lib/Target/AArch64/SVEInstrFormats.td b/llvm/lib/Target/AArch64/SVEInstrFormats.td
index 717eff34d2e0..83eaef835cfa 100644
--- a/llvm/lib/Target/AArch64/SVEInstrFormats.td
+++ b/llvm/lib/Target/AArch64/SVEInstrFormats.td
@@ -5557,14 +5557,21 @@ class sve_mem_cld_si_base<bits<4> dtype, bit nf, string asm,
multiclass sve_mem_cld_si_base<bits<4> dtype, bit nf, string asm,
RegisterOperand listty, ZPRRegOp zprty> {
- def "" : sve_mem_cld_si_base<dtype, nf, asm, listty>;
+ def _REAL : sve_mem_cld_si_base<dtype, nf, asm, listty>;
def : InstAlias<asm # "\t$Zt, $Pg/z, [$Rn]",
- (!cast<Instruction>(NAME) zprty:$Zt, PPR3bAny:$Pg, GPR64sp:$Rn, 0), 0>;
+ (!cast<Instruction>(NAME # _REAL) zprty:$Zt, PPR3bAny:$Pg, GPR64sp:$Rn, 0), 0>;
def : InstAlias<asm # "\t$Zt, $Pg/z, [$Rn, $imm4, mul vl]",
- (!cast<Instruction>(NAME) zprty:$Zt, PPR3bAny:$Pg, GPR64sp:$Rn, simm4s1:$imm4), 0>;
+ (!cast<Instruction>(NAME # _REAL) zprty:$Zt, PPR3bAny:$Pg, GPR64sp:$Rn, simm4s1:$imm4), 0>;
def : InstAlias<asm # "\t$Zt, $Pg/z, [$Rn]",
- (!cast<Instruction>(NAME) listty:$Zt, PPR3bAny:$Pg, GPR64sp:$Rn, 0), 1>;
+ (!cast<Instruction>(NAME # _REAL) listty:$Zt, PPR3bAny:$Pg, GPR64sp:$Rn, 0), 1>;
+
+ // We need a layer of indirection because early machine code passes balk at
+ // physical register (i.e. FFR) uses that have no previous definition.
+ let hasSideEffects = 1, hasNoSchedulingInfo = 1, mayLoad = 1 in {
+ def "" : Pseudo<(outs listty:$Zt), (ins PPR3bAny:$Pg, GPR64sp:$Rn, simm4s1:$imm4), []>,
+ PseudoInstExpansion<(!cast<Instruction>(NAME # _REAL) listty:$Zt, PPR3bAny:$Pg, GPR64sp:$Rn, simm4s1:$imm4)>;
+ }
}
multiclass sve_mem_cld_si<bits<4> dtype, string asm, RegisterOperand listty,
diff --git a/llvm/test/CodeGen/AArch64/sve-intrinsics-loads-nf.ll b/llvm/test/CodeGen/AArch64/sve-intrinsics-loads-nf.ll
new file mode 100644
index 000000000000..162ade5aca4d
--- /dev/null
+++ b/llvm/test/CodeGen/AArch64/sve-intrinsics-loads-nf.ll
@@ -0,0 +1,182 @@
+; RUN: llc -mtriple=aarch64-linux-gnu -mattr=+sve < %s | FileCheck %s
+
+define <vscale x 16 x i8> @ldnf1b(<vscale x 16 x i1> %pg, i8* %a) {
+; CHECK-LABEL: ldnf1b:
+; CHECK: ldnf1b { z0.b }, p0/z, [x0]
+; CHECK-NEXT: ret
+ %load = call <vscale x 16 x i8> @llvm.aarch64.sve.ldnf1.nxv16i8(<vscale x 16 x i1> %pg, i8* %a)
+ ret <vscale x 16 x i8> %load
+}
+
+define <vscale x 8 x i16> @ldnf1b_h(<vscale x 8 x i1> %pg, i8* %a) {
+; CHECK-LABEL: ldnf1b_h:
+; CHECK: ldnf1b { z0.h }, p0/z, [x0]
+; CHECK-NEXT: ret
+ %load = call <vscale x 8 x i8> @llvm.aarch64.sve.ldnf1.nxv8i8(<vscale x 8 x i1> %pg, i8* %a)
+ %res = zext <vscale x 8 x i8> %load to <vscale x 8 x i16>
+ ret <vscale x 8 x i16> %res
+}
+
+define <vscale x 8 x i16> @ldnf1sb_h(<vscale x 8 x i1> %pg, i8* %a) {
+; CHECK-LABEL: ldnf1sb_h:
+; CHECK: ldnf1sb { z0.h }, p0/z, [x0]
+; CHECK-NEXT: ret
+ %load = call <vscale x 8 x i8> @llvm.aarch64.sve.ldnf1.nxv8i8(<vscale x 8 x i1> %pg, i8* %a)
+ %res = sext <vscale x 8 x i8> %load to <vscale x 8 x i16>
+ ret <vscale x 8 x i16> %res
+}
+
+define <vscale x 8 x i16> @ldnf1h(<vscale x 8 x i1> %pg, i16* %a) {
+; CHECK-LABEL: ldnf1h:
+; CHECK: ldnf1h { z0.h }, p0/z, [x0]
+; CHECK-NEXT: ret
+ %load = call <vscale x 8 x i16> @llvm.aarch64.sve.ldnf1.nxv8i16(<vscale x 8 x i1> %pg, i16* %a)
+ ret <vscale x 8 x i16> %load
+}
+
+define <vscale x 8 x half> @ldnf1h_f16(<vscale x 8 x i1> %pg, half* %a) {
+; CHECK-LABEL: ldnf1h_f16:
+; CHECK: ldnf1h { z0.h }, p0/z, [x0]
+; CHECK-NEXT: ret
+ %load = call <vscale x 8 x half> @llvm.aarch64.sve.ldnf1.nxv8f16(<vscale x 8 x i1> %pg, half* %a)
+ ret <vscale x 8 x half> %load
+}
+
+define <vscale x 4 x i32> @ldnf1b_s(<vscale x 4 x i1> %pg, i8* %a) {
+; CHECK-LABEL: ldnf1b_s:
+; CHECK: ldnf1b { z0.s }, p0/z, [x0]
+; CHECK-NEXT: ret
+ %load = call <vscale x 4 x i8> @llvm.aarch64.sve.ldnf1.nxv4i8(<vscale x 4 x i1> %pg, i8* %a)
+ %res = zext <vscale x 4 x i8> %load to <vscale x 4 x i32>
+ ret <vscale x 4 x i32> %res
+}
+
+define <vscale x 4 x i32> @ldnf1sb_s(<vscale x 4 x i1> %pg, i8* %a) {
+; CHECK-LABEL: ldnf1sb_s:
+; CHECK: ldnf1sb { z0.s }, p0/z, [x0]
+; CHECK-NEXT: ret
+ %load = call <vscale x 4 x i8> @llvm.aarch64.sve.ldnf1.nxv4i8(<vscale x 4 x i1> %pg, i8* %a)
+ %res = sext <vscale x 4 x i8> %load to <vscale x 4 x i32>
+ ret <vscale x 4 x i32> %res
+}
+
+define <vscale x 4 x i32> @ldnf1h_s(<vscale x 4 x i1> %pg, i16* %a) {
+; CHECK-LABEL: ldnf1h_s:
+; CHECK: ldnf1h { z0.s }, p0/z, [x0]
+; CHECK-NEXT: ret
+ %load = call <vscale x 4 x i16> @llvm.aarch64.sve.ldnf1.nxv4i16(<vscale x 4 x i1> %pg, i16* %a)
+ %res = zext <vscale x 4 x i16> %load to <vscale x 4 x i32>
+ ret <vscale x 4 x i32> %res
+}
+
+define <vscale x 4 x i32> @ldnf1sh_s(<vscale x 4 x i1> %pg, i16* %a) {
+; CHECK-LABEL: ldnf1sh_s:
+; CHECK: ldnf1sh { z0.s }, p0/z, [x0]
+; CHECK-NEXT: ret
+ %load = call <vscale x 4 x i16> @llvm.aarch64.sve.ldnf1.nxv4i16(<vscale x 4 x i1> %pg, i16* %a)
+ %res = sext <vscale x 4 x i16> %load to <vscale x 4 x i32>
+ ret <vscale x 4 x i32> %res
+}
+
+define <vscale x 4 x i32> @ldnf1w(<vscale x 4 x i1> %pg, i32* %a) {
+; CHECK-LABEL: ldnf1w:
+; CHECK: ldnf1w { z0.s }, p0/z, [x0]
+; CHECK-NEXT: ret
+ %load = call <vscale x 4 x i32> @llvm.aarch64.sve.ldnf1.nxv4i32(<vscale x 4 x i1> %pg, i32* %a)
+ ret <vscale x 4 x i32> %load
+}
+
+define <vscale x 4 x float> @ldnf1w_f32(<vscale x 4 x i1> %pg, float* %a) {
+; CHECK-LABEL: ldnf1w_f32:
+; CHECK: ldnf1w { z0.s }, p0/z, [x0]
+; CHECK-NEXT: ret
+ %load = call <vscale x 4 x float> @llvm.aarch64.sve.ldnf1.nxv4f32(<vscale x 4 x i1> %pg, float* %a)
+ ret <vscale x 4 x float> %load
+}
+
+define <vscale x 2 x i64> @ldnf1b_d(<vscale x 2 x i1> %pg, i8* %a) {
+; CHECK-LABEL: ldnf1b_d:
+; CHECK: ldnf1b { z0.d }, p0/z, [x0]
+; CHECK-NEXT: ret
+ %load = call <vscale x 2 x i8> @llvm.aarch64.sve.ldnf1.nxv2i8(<vscale x 2 x i1> %pg, i8* %a)
+ %res = zext <vscale x 2 x i8> %load to <vscale x 2 x i64>
+ ret <vscale x 2 x i64> %res
+}
+
+define <vscale x 2 x i64> @ldnf1sb_d(<vscale x 2 x i1> %pg, i8* %a) {
+; CHECK-LABEL: ldnf1sb_d:
+; CHECK: ldnf1sb { z0.d }, p0/z, [x0]
+; CHECK-NEXT: ret
+ %load = call <vscale x 2 x i8> @llvm.aarch64.sve.ldnf1.nxv2i8(<vscale x 2 x i1> %pg, i8* %a)
+ %res = sext <vscale x 2 x i8> %load to <vscale x 2 x i64>
+ ret <vscale x 2 x i64> %res
+}
+
+define <vscale x 2 x i64> @ldnf1h_d(<vscale x 2 x i1> %pg, i16* %a) {
+; CHECK-LABEL: ldnf1h_d:
+; CHECK: ldnf1h { z0.d }, p0/z, [x0]
+; CHECK-NEXT: ret
+ %load = call <vscale x 2 x i16> @llvm.aarch64.sve.ldnf1.nxv2i16(<vscale x 2 x i1> %pg, i16* %a)
+ %res = zext <vscale x 2 x i16> %load to <vscale x 2 x i64>
+ ret <vscale x 2 x i64> %res
+}
+
+define <vscale x 2 x i64> @ldnf1sh_d(<vscale x 2 x i1> %pg, i16* %a) {
+; CHECK-LABEL: ldnf1sh_d:
+; CHECK: ldnf1sh { z0.d }, p0/z, [x0]
+; CHECK-NEXT: ret
+ %load = call <vscale x 2 x i16> @llvm.aarch64.sve.ldnf1.nxv2i16(<vscale x 2 x i1> %pg, i16* %a)
+ %res = sext <vscale x 2 x i16> %load to <vscale x 2 x i64>
+ ret <vscale x 2 x i64> %res
+}
+
+define <vscale x 2 x i64> @ldnf1w_d(<vscale x 2 x i1> %pg, i32* %a) {
+; CHECK-LABEL: ldnf1w_d:
+; CHECK: ldnf1w { z0.d }, p0/z, [x0]
+; CHECK-NEXT: ret
+ %load = call <vscale x 2 x i32> @llvm.aarch64.sve.ldnf1.nxv2i32(<vscale x 2 x i1> %pg, i32* %a)
+ %res = zext <vscale x 2 x i32> %load to <vscale x 2 x i64>
+ ret <vscale x 2 x i64> %res
+}
+
+define <vscale x 2 x i64> @ldnf1sw_d(<vscale x 2 x i1> %pg, i32* %a) {
+; CHECK-LABEL: ldnf1sw_d:
+; CHECK: ldnf1sw { z0.d }, p0/z, [x0]
+; CHECK-NEXT: ret
+ %load = call <vscale x 2 x i32> @llvm.aarch64.sve.ldnf1.nxv2i32(<vscale x 2 x i1> %pg, i32* %a)
+ %res = sext <vscale x 2 x i32> %load to <vscale x 2 x i64>
+ ret <vscale x 2 x i64> %res
+}
+
+define <vscale x 2 x i64> @ldnf1d(<vscale x 2 x i1> %pg, i64* %a) {
+; CHECK-LABEL: ldnf1d:
+; CHECK: ldnf1d { z0.d }, p0/z, [x0]
+; CHECK-NEXT: ret
+ %load = call <vscale x 2 x i64> @llvm.aarch64.sve.ldnf1.nxv2i64(<vscale x 2 x i1> %pg, i64* %a)
+ ret <vscale x 2 x i64> %load
+}
+
+define <vscale x 2 x double> @ldnf1d_f64(<vscale x 2 x i1> %pg, double* %a) {
+; CHECK-LABEL: ldnf1d_f64:
+; CHECK: ldnf1d { z0.d }, p0/z, [x0]
+; CHECK-NEXT: ret
+ %load = call <vscale x 2 x double> @llvm.aarch64.sve.ldnf1.nxv2f64(<vscale x 2 x i1> %pg, double* %a)
+ ret <vscale x 2 x double> %load
+}
+
+declare <vscale x 16 x i8> @llvm.aarch64.sve.ldnf1.nxv16i8(<vscale x 16 x i1>, i8*)
+
+declare <vscale x 8 x i8> @llvm.aarch64.sve.ldnf1.nxv8i8(<vscale x 8 x i1>, i8*)
+declare <vscale x 8 x i16> @llvm.aarch64.sve.ldnf1.nxv8i16(<vscale x 8 x i1>, i16*)
+declare <vscale x 8 x half> @llvm.aarch64.sve.ldnf1.nxv8f16(<vscale x 8 x i1>, half*)
+
+declare <vscale x 4 x i8> @llvm.aarch64.sve.ldnf1.nxv4i8(<vscale x 4 x i1>, i8*)
+declare <vscale x 4 x i16> @llvm.aarch64.sve.ldnf1.nxv4i16(<vscale x 4 x i1>, i16*)
+declare <vscale x 4 x i32> @llvm.aarch64.sve.ldnf1.nxv4i32(<vscale x 4 x i1>, i32*)
+declare <vscale x 4 x float> @llvm.aarch64.sve.ldnf1.nxv4f32(<vscale x 4 x i1>, float*)
+
+declare <vscale x 2 x i8> @llvm.aarch64.sve.ldnf1.nxv2i8(<vscale x 2 x i1>, i8*)
+declare <vscale x 2 x i16> @llvm.aarch64.sve.ldnf1.nxv2i16(<vscale x 2 x i1>, i16*)
+declare <vscale x 2 x i32> @llvm.aarch64.sve.ldnf1.nxv2i32(<vscale x 2 x i1>, i32*)
+declare <vscale x 2 x i64> @llvm.aarch64.sve.ldnf1.nxv2i64(<vscale x 2 x i1>, i64*)
+declare <vscale x 2 x double> @llvm.aarch64.sve.ldnf1.nxv2f64(<vscale x 2 x i1>, double*)