[llvm] b82be5d - [AArch64][SVE] Implement structured load intrinsics
Cullen Rhodes via llvm-commits
llvm-commits at lists.llvm.org
Tue Jun 9 01:52:23 PDT 2020
Author: Cullen Rhodes
Date: 2020-06-09T08:51:58Z
New Revision: b82be5db71fbe74f2407c7e38fb5e18fecaf08e4
URL: https://github.com/llvm/llvm-project/commit/b82be5db71fbe74f2407c7e38fb5e18fecaf08e4
DIFF: https://github.com/llvm/llvm-project/commit/b82be5db71fbe74f2407c7e38fb5e18fecaf08e4.diff
LOG: [AArch64][SVE] Implement structured load intrinsics
Summary:
This patch adds initial support for the following intrinsics:
* llvm.aarch64.sve.ld2
* llvm.aarch64.sve.ld3
* llvm.aarch64.sve.ld4
for loading two, three, and four vectors' worth of data, respectively. Basic
codegen is implemented here; the reg+reg and reg+imm addressing modes will be
addressed in a later patch.
Each of these intrinsics returns a single wide vector whose element count is N
times the element count of a 128-bit vector of the given element type, where N
is the number of vectors being loaded, i.e. 2, 3 or 4. Thus, for 32-bit
elements the returned types are:
LD2 : <vscale x 8 x i32>
LD3 : <vscale x 12 x i32>
LD4 : <vscale x 16 x i32>
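For example, the LD3 case above is written as follows at the IR level (this is
the ld3w_i32 case from the updated sve-intrinsics-loads.ll test):

  %res = call <vscale x 12 x i32> @llvm.aarch64.sve.ld3.nxv12i32.nxv4i1.p0nxv4i32(
               <vscale x 4 x i1> %pred,
               <vscale x 4 x i32>* %addr)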
This is implemented with a target-specific node for each variant
(AArch64ISD::SVE_LD2/3/4) that takes the same operands as the IR intrinsic but
returns N values, where each value is a full vector, i.e. <vscale x 4 x i32>
in the above example. These values are then concatenated using a standard
concat_vectors node so that the result has the wide tuple type expected by
the IR.
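For reference, the updated test checks that the nxv12i32 LD3 call shown above
selects to a single structured load with consecutive destination registers:

  ld3w { z0.s, z1.s, z2.s }, p0/z, [x0]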
These intrinsics are intended for use in implementing the Arm C Language
Extensions (ACLE).
Reviewed By: sdesmalen
Differential Revision: https://reviews.llvm.org/D75751
Added:
Modified:
llvm/include/llvm/IR/IntrinsicsAArch64.td
llvm/lib/Target/AArch64/AArch64ISelDAGToDAG.cpp
llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
llvm/lib/Target/AArch64/AArch64ISelLowering.h
llvm/test/CodeGen/AArch64/sve-intrinsics-loads.ll
Removed:
################################################################################
diff --git a/llvm/include/llvm/IR/IntrinsicsAArch64.td b/llvm/include/llvm/IR/IntrinsicsAArch64.td
index 22a3c2e6d9bf..d6755a2331c3 100644
--- a/llvm/include/llvm/IR/IntrinsicsAArch64.td
+++ b/llvm/include/llvm/IR/IntrinsicsAArch64.td
@@ -814,6 +814,10 @@ let TargetPrefix = "aarch64" in { // All intrinsics start with "llvm.aarch64.".
: Intrinsic<[llvm_anyvector_ty], [llvm_anyvector_ty, llvm_i32_ty],
[IntrReadMem, IntrArgMemOnly, ImmArg<ArgIndex<1>>]>;
+ class AdvSIMD_ManyVec_PredLoad_Intrinsic
+ : Intrinsic<[llvm_anyvector_ty], [llvm_anyvector_ty, llvm_anyptr_ty],
+ [IntrReadMem, IntrArgMemOnly]>;
+
class AdvSIMD_1Vec_PredLoad_Intrinsic
: Intrinsic<[llvm_anyvector_ty],
[LLVMScalarOrSameVectorWidth<0, llvm_i1_ty>,
@@ -1346,6 +1350,10 @@ def int_aarch64_sve_tuple_set : AdvSIMD_SVE_Set_Vector_Tuple;
def int_aarch64_sve_ld1 : AdvSIMD_1Vec_PredLoad_Intrinsic;
+def int_aarch64_sve_ld2 : AdvSIMD_ManyVec_PredLoad_Intrinsic;
+def int_aarch64_sve_ld3 : AdvSIMD_ManyVec_PredLoad_Intrinsic;
+def int_aarch64_sve_ld4 : AdvSIMD_ManyVec_PredLoad_Intrinsic;
+
def int_aarch64_sve_ldnt1 : AdvSIMD_1Vec_PredLoad_Intrinsic;
def int_aarch64_sve_ldnf1 : AdvSIMD_1Vec_PredLoad_Intrinsic;
def int_aarch64_sve_ldff1 : AdvSIMD_1Vec_PredLoad_Intrinsic;
diff --git a/llvm/lib/Target/AArch64/AArch64ISelDAGToDAG.cpp b/llvm/lib/Target/AArch64/AArch64ISelDAGToDAG.cpp
index ed297d338855..3e4dd878120c 100644
--- a/llvm/lib/Target/AArch64/AArch64ISelDAGToDAG.cpp
+++ b/llvm/lib/Target/AArch64/AArch64ISelDAGToDAG.cpp
@@ -245,6 +245,7 @@ class AArch64DAGToDAGISel : public SelectionDAGISel {
unsigned SubRegIdx);
void SelectLoadLane(SDNode *N, unsigned NumVecs, unsigned Opc);
void SelectPostLoadLane(SDNode *N, unsigned NumVecs, unsigned Opc);
+ void SelectPredicatedLoad(SDNode *N, unsigned NumVecs, const unsigned Opc);
bool SelectAddrModeFrameIndexSVE(SDValue N, SDValue &Base, SDValue &OffImm);
/// SVE Reg+Imm addressing mode.
@@ -1441,6 +1442,30 @@ AArch64DAGToDAGISel::findAddrModeSVELoadStore(SDNode *N, const unsigned Opc_rr,
return std::make_tuple(IsRegReg ? Opc_rr : Opc_ri, NewBase, NewOffset);
}
+void AArch64DAGToDAGISel::SelectPredicatedLoad(SDNode *N, unsigned NumVecs,
+ const unsigned Opc) {
+ SDLoc DL(N);
+ EVT VT = N->getValueType(0);
+ SDValue Chain = N->getOperand(0);
+
+ SDValue Ops[] = {N->getOperand(1), // Predicate
+ N->getOperand(2), // Memory operand
+ CurDAG->getTargetConstant(0, DL, MVT::i64), Chain};
+
+ const EVT ResTys[] = {MVT::Untyped, MVT::Other};
+
+ SDNode *Load = CurDAG->getMachineNode(Opc, DL, ResTys, Ops);
+ SDValue SuperReg = SDValue(Load, 0);
+ for (unsigned i = 0; i < NumVecs; ++i)
+ ReplaceUses(SDValue(N, i), CurDAG->getTargetExtractSubreg(
+ AArch64::zsub0 + i, DL, VT, SuperReg));
+
+ // Copy chain
+ unsigned ChainIdx = NumVecs;
+ ReplaceUses(SDValue(N, ChainIdx), SDValue(Load, 1));
+ CurDAG->RemoveDeadNode(N);
+}
+
void AArch64DAGToDAGISel::SelectStore(SDNode *N, unsigned NumVecs,
unsigned Opc) {
SDLoc dl(N);
@@ -4603,6 +4628,54 @@ void AArch64DAGToDAGISel::Select(SDNode *Node) {
}
break;
}
+ case AArch64ISD::SVE_LD2: {
+ if (VT == MVT::nxv16i8) {
+ SelectPredicatedLoad(Node, 2, AArch64::LD2B_IMM);
+ return;
+ } else if (VT == MVT::nxv8i16 || VT == MVT::nxv8f16) {
+ SelectPredicatedLoad(Node, 2, AArch64::LD2H_IMM);
+ return;
+ } else if (VT == MVT::nxv4i32 || VT == MVT::nxv4f32) {
+ SelectPredicatedLoad(Node, 2, AArch64::LD2W_IMM);
+ return;
+ } else if (VT == MVT::nxv2i64 || VT == MVT::nxv2f64) {
+ SelectPredicatedLoad(Node, 2, AArch64::LD2D_IMM);
+ return;
+ }
+ break;
+ }
+ case AArch64ISD::SVE_LD3: {
+ if (VT == MVT::nxv16i8) {
+ SelectPredicatedLoad(Node, 3, AArch64::LD3B_IMM);
+ return;
+ } else if (VT == MVT::nxv8i16 || VT == MVT::nxv8f16) {
+ SelectPredicatedLoad(Node, 3, AArch64::LD3H_IMM);
+ return;
+ } else if (VT == MVT::nxv4i32 || VT == MVT::nxv4f32) {
+ SelectPredicatedLoad(Node, 3, AArch64::LD3W_IMM);
+ return;
+ } else if (VT == MVT::nxv2i64 || VT == MVT::nxv2f64) {
+ SelectPredicatedLoad(Node, 3, AArch64::LD3D_IMM);
+ return;
+ }
+ break;
+ }
+ case AArch64ISD::SVE_LD4: {
+ if (VT == MVT::nxv16i8) {
+ SelectPredicatedLoad(Node, 4, AArch64::LD4B_IMM);
+ return;
+ } else if (VT == MVT::nxv8i16 || VT == MVT::nxv8f16) {
+ SelectPredicatedLoad(Node, 4, AArch64::LD4H_IMM);
+ return;
+ } else if (VT == MVT::nxv4i32 || VT == MVT::nxv4f32) {
+ SelectPredicatedLoad(Node, 4, AArch64::LD4W_IMM);
+ return;
+ } else if (VT == MVT::nxv2i64 || VT == MVT::nxv2f64) {
+ SelectPredicatedLoad(Node, 4, AArch64::LD4D_IMM);
+ return;
+ }
+ break;
+ }
}
// Select the default instruction
diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
index 43a85d510d6a..854c94638877 100644
--- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
+++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
@@ -1467,6 +1467,9 @@ const char *AArch64TargetLowering::getTargetNodeName(unsigned Opcode) const {
case AArch64ISD::LDFF1S: return "AArch64ISD::LDFF1S";
case AArch64ISD::LD1RQ: return "AArch64ISD::LD1RQ";
case AArch64ISD::LD1RO: return "AArch64ISD::LD1RO";
+ case AArch64ISD::SVE_LD2: return "AArch64ISD::SVE_LD2";
+ case AArch64ISD::SVE_LD3: return "AArch64ISD::SVE_LD3";
+ case AArch64ISD::SVE_LD4: return "AArch64ISD::SVE_LD4";
case AArch64ISD::GLD1: return "AArch64ISD::GLD1";
case AArch64ISD::GLD1_SCALED: return "AArch64ISD::GLD1_SCALED";
case AArch64ISD::GLD1_SXTW: return "AArch64ISD::GLD1_SXTW";
@@ -9796,6 +9799,56 @@ bool AArch64TargetLowering::lowerInterleavedStore(StoreInst *SI,
return true;
}
+// Lower an SVE structured load intrinsic returning a tuple type to target
+// specific intrinsic taking the same input but returning a multi-result value
+// of the split tuple type.
+//
+// E.g. Lowering an LD3:
+//
+// call <vscale x 12 x i32> @llvm.aarch64.sve.ld3.nxv12i32(
+// <vscale x 4 x i1> %pred,
+// <vscale x 4 x i32>* %addr)
+//
+// Output DAG:
+//
+// t0: ch = EntryToken
+// t2: nxv4i1,ch = CopyFromReg t0, Register:nxv4i1 %0
+// t4: i64,ch = CopyFromReg t0, Register:i64 %1
+// t5: nxv4i32,nxv4i32,nxv4i32,ch = AArch64ISD::SVE_LD3 t0, t2, t4
+// t6: nxv12i32 = concat_vectors t5, t5:1, t5:2
+//
+// This is called pre-legalization to avoid widening/splitting issues with
+// non-power-of-2 tuple types used for LD3, such as nxv12i32.
+SDValue AArch64TargetLowering::LowerSVEStructLoad(unsigned Intrinsic,
+ ArrayRef<SDValue> LoadOps,
+ EVT VT, SelectionDAG &DAG,
+ const SDLoc &DL) const {
+ assert(VT.isScalableVector() && "Can only lower scalable vectors");
+
+ unsigned N, Opcode;
+ static std::map<unsigned, std::pair<unsigned, unsigned>> IntrinsicMap = {
+ {Intrinsic::aarch64_sve_ld2, {2, AArch64ISD::SVE_LD2}},
+ {Intrinsic::aarch64_sve_ld3, {3, AArch64ISD::SVE_LD3}},
+ {Intrinsic::aarch64_sve_ld4, {4, AArch64ISD::SVE_LD4}}};
+
+ std::tie(N, Opcode) = IntrinsicMap[Intrinsic];
+ assert(VT.getVectorElementCount().Min % N == 0 &&
+ "invalid tuple vector type!");
+
+ EVT SplitVT = EVT::getVectorVT(*DAG.getContext(), VT.getVectorElementType(),
+ VT.getVectorElementCount() / N);
+ assert(isTypeLegal(SplitVT));
+
+ SmallVector<EVT, 5> VTs(N, SplitVT);
+ VTs.push_back(MVT::Other); // Chain
+ SDVTList NodeTys = DAG.getVTList(VTs);
+
+ SDValue PseudoLoad = DAG.getNode(Opcode, DL, NodeTys, LoadOps);
+ SmallVector<SDValue, 4> PseudoLoadOps;
+ for (unsigned I = 0; I < N; ++I)
+ PseudoLoadOps.push_back(SDValue(PseudoLoad.getNode(), I));
+ return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, PseudoLoadOps);
+}
EVT AArch64TargetLowering::getOptimalMemOpType(
const MemOp &Op, const AttributeList &FuncAttributes) const {
@@ -13728,6 +13781,20 @@ SDValue AArch64TargetLowering::PerformDAGCombine(SDNode *N,
SDValue Concat = DAG.getNode(ISD::CONCAT_VECTORS, DL, DestVT, Opnds);
return DAG.getMergeValues({Concat, Chain}, DL);
}
+ case Intrinsic::aarch64_sve_ld2:
+ case Intrinsic::aarch64_sve_ld3:
+ case Intrinsic::aarch64_sve_ld4: {
+ SDLoc DL(N);
+ SDValue Chain = N->getOperand(0);
+ SDValue Mask = N->getOperand(2);
+ SDValue BasePtr = N->getOperand(3);
+ SDValue LoadOps[] = {Chain, Mask, BasePtr};
+ unsigned IntrinsicID =
+ cast<ConstantSDNode>(N->getOperand(1))->getZExtValue();
+ SDValue Result =
+ LowerSVEStructLoad(IntrinsicID, LoadOps, N->getValueType(0), DAG, DL);
+ return DAG.getMergeValues({Result, Chain}, DL);
+ }
default:
break;
}
diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.h b/llvm/lib/Target/AArch64/AArch64ISelLowering.h
index e42c0b6e05b7..2a68220b6283 100644
--- a/llvm/lib/Target/AArch64/AArch64ISelLowering.h
+++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.h
@@ -255,6 +255,11 @@ enum NodeType : unsigned {
LD1RQ,
LD1RO,
+ // Structured loads.
+ SVE_LD2,
+ SVE_LD3,
+ SVE_LD4,
+
// Unsigned gather loads.
GLD1,
GLD1_SCALED,
@@ -835,6 +840,8 @@ class AArch64TargetLowering : public TargetLowering {
SDValue LowerWindowsDYNAMIC_STACKALLOC(SDValue Op, SDValue Chain,
SDValue &Size,
SelectionDAG &DAG) const;
+ SDValue LowerSVEStructLoad(unsigned Intrinsic, ArrayRef<SDValue> LoadOps,
+ EVT VT, SelectionDAG &DAG, const SDLoc &DL) const;
SDValue BuildSDIVPow2(SDNode *N, const APInt &Divisor, SelectionDAG &DAG,
SmallVectorImpl<SDNode *> &Created) const override;
diff --git a/llvm/test/CodeGen/AArch64/sve-intrinsics-loads.ll b/llvm/test/CodeGen/AArch64/sve-intrinsics-loads.ll
index 74717d393f55..1244782bd56b 100644
--- a/llvm/test/CodeGen/AArch64/sve-intrinsics-loads.ll
+++ b/llvm/test/CodeGen/AArch64/sve-intrinsics-loads.ll
@@ -1,4 +1,4 @@
-; RUN: llc -mtriple=aarch64-linux-gnu -mattr=+sve < %s | FileCheck %s
+; RUN: llc -mtriple=aarch64-linux-gnu -mattr=+sve -asm-verbose=0 < %s | FileCheck %s
;
; LD1RQB
@@ -252,6 +252,244 @@ define <vscale x 2 x double> @ldnt1d_f64(<vscale x 2 x i1> %pred, double* %addr)
ret <vscale x 2 x double> %res
}
+;
+; LD2B
+;
+
+define <vscale x 32 x i8> @ld2b_i8(<vscale x 16 x i1> %pred, <vscale x 16 x i8>* %addr) {
+; CHECK-LABEL: ld2b_i8:
+; CHECK: ld2b { z0.b, z1.b }, p0/z, [x0]
+; CHECK-NEXT: ret
+ %res = call <vscale x 32 x i8> @llvm.aarch64.sve.ld2.nxv32i8.nxv16i1.p0nxv16i8(<vscale x 16 x i1> %pred,
+ <vscale x 16 x i8>* %addr)
+ ret <vscale x 32 x i8> %res
+}
+
+;
+; LD2H
+;
+
+define <vscale x 16 x i16> @ld2h_i16(<vscale x 8 x i1> %pred, <vscale x 8 x i16>* %addr) {
+; CHECK-LABEL: ld2h_i16:
+; CHECK: ld2h { z0.h, z1.h }, p0/z, [x0]
+; CHECK-NEXT: ret
+ %res = call <vscale x 16 x i16> @llvm.aarch64.sve.ld2.nxv16i16.nxv8i1.p0nxv8i16(<vscale x 8 x i1> %pred,
+ <vscale x 8 x i16>* %addr)
+ ret <vscale x 16 x i16> %res
+}
+
+define <vscale x 16 x half> @ld2h_f16(<vscale x 8 x i1> %pred, <vscale x 8 x half>* %addr) {
+; CHECK-LABEL: ld2h_f16:
+; CHECK: ld2h { z0.h, z1.h }, p0/z, [x0]
+; CHECK-NEXT: ret
+ %res = call <vscale x 16 x half> @llvm.aarch64.sve.ld2.nxv16f16.nxv8i1.p0nxv8f16(<vscale x 8 x i1> %pred,
+ <vscale x 8 x half>* %addr)
+ ret <vscale x 16 x half> %res
+}
+
+;
+; LD2W
+;
+
+define <vscale x 8 x i32> @ld2w_i32(<vscale x 4 x i1> %pred, <vscale x 4 x i32>* %addr) {
+; CHECK-LABEL: ld2w_i32:
+; CHECK: ld2w { z0.s, z1.s }, p0/z, [x0]
+; CHECK-NEXT: ret
+ %res = call <vscale x 8 x i32> @llvm.aarch64.sve.ld2.nxv8i32.nxv4i1.p0nxv4i32(<vscale x 4 x i1> %pred,
+ <vscale x 4 x i32>* %addr)
+ ret <vscale x 8 x i32> %res
+}
+
+define <vscale x 8 x float> @ld2w_f32(<vscale x 4 x i1> %pred, <vscale x 4 x float>* %addr) {
+; CHECK-LABEL: ld2w_f32:
+; CHECK: ld2w { z0.s, z1.s }, p0/z, [x0]
+; CHECK-NEXT: ret
+ %res = call <vscale x 8 x float> @llvm.aarch64.sve.ld2.nxv8f32.nxv4i1.p0nxv4f32(<vscale x 4 x i1> %pred,
+ <vscale x 4 x float>* %addr)
+ ret <vscale x 8 x float> %res
+}
+
+;
+; LD2D
+;
+
+define <vscale x 4 x i64> @ld2d_i64(<vscale x 2 x i1> %pred, <vscale x 2 x i64>* %addr) {
+; CHECK-LABEL: ld2d_i64:
+; CHECK: ld2d { z0.d, z1.d }, p0/z, [x0]
+; CHECK-NEXT: ret
+ %res = call <vscale x 4 x i64> @llvm.aarch64.sve.ld2.nxv4i64.nxv2i1.p0nxv2i64(<vscale x 2 x i1> %pred,
+ <vscale x 2 x i64>* %addr)
+ ret <vscale x 4 x i64> %res
+}
+
+define <vscale x 4 x double> @ld2d_f64(<vscale x 2 x i1> %pred, <vscale x 2 x double>* %addr) {
+; CHECK-LABEL: ld2d_f64:
+; CHECK: ld2d { z0.d, z1.d }, p0/z, [x0]
+; CHECK-NEXT: ret
+ %res = call <vscale x 4 x double> @llvm.aarch64.sve.ld2.nxv4f64.nxv2i1.p0nxv2f64(<vscale x 2 x i1> %pred,
+ <vscale x 2 x double>* %addr)
+ ret <vscale x 4 x double> %res
+}
+
+;
+; LD3B
+;
+
+define <vscale x 48 x i8> @ld3b_i8(<vscale x 16 x i1> %pred, <vscale x 16 x i8>* %addr) {
+; CHECK-LABEL: ld3b_i8:
+; CHECK: ld3b { z0.b, z1.b, z2.b }, p0/z, [x0]
+; CHECK-NEXT: ret
+ %res = call <vscale x 48 x i8> @llvm.aarch64.sve.ld3.nxv48i8.nxv16i1.p0nxv16i8(<vscale x 16 x i1> %pred,
+ <vscale x 16 x i8>* %addr)
+ ret <vscale x 48 x i8> %res
+}
+
+;
+; LD3H
+;
+
+define <vscale x 24 x i16> @ld3h_i16(<vscale x 8 x i1> %pred, <vscale x 8 x i16>* %addr) {
+; CHECK-LABEL: ld3h_i16:
+; CHECK: ld3h { z0.h, z1.h, z2.h }, p0/z, [x0]
+; CHECK-NEXT: ret
+ %res = call <vscale x 24 x i16> @llvm.aarch64.sve.ld3.nxv24i16.nxv8i1.p0nxv8i16(<vscale x 8 x i1> %pred,
+ <vscale x 8 x i16>* %addr)
+ ret <vscale x 24 x i16> %res
+}
+
+define <vscale x 24 x half> @ld3h_f16(<vscale x 8 x i1> %pred, <vscale x 8 x half>* %addr) {
+; CHECK-LABEL: ld3h_f16:
+; CHECK: ld3h { z0.h, z1.h, z2.h }, p0/z, [x0]
+; CHECK-NEXT: ret
+ %res = call <vscale x 24 x half> @llvm.aarch64.sve.ld3.nxv24f16.nxv8i1.p0nxv8f16(<vscale x 8 x i1> %pred,
+ <vscale x 8 x half>* %addr)
+ ret <vscale x 24 x half> %res
+}
+
+;
+; LD3W
+;
+
+define <vscale x 12 x i32> @ld3w_i32(<vscale x 4 x i1> %pred, <vscale x 4 x i32>* %addr) {
+; CHECK-LABEL: ld3w_i32:
+; CHECK: ld3w { z0.s, z1.s, z2.s }, p0/z, [x0]
+; CHECK-NEXT: ret
+ %res = call <vscale x 12 x i32> @llvm.aarch64.sve.ld3.nxv12i32.nxv4i1.p0nxv4i32(<vscale x 4 x i1> %pred,
+ <vscale x 4 x i32>* %addr)
+ ret <vscale x 12 x i32> %res
+}
+
+define <vscale x 12 x float> @ld3w_f32(<vscale x 4 x i1> %pred, <vscale x 4 x float>* %addr) {
+; CHECK-LABEL: ld3w_f32:
+; CHECK: ld3w { z0.s, z1.s, z2.s }, p0/z, [x0]
+; CHECK-NEXT: ret
+ %res = call <vscale x 12 x float> @llvm.aarch64.sve.ld3.nxv12f32.nxv4i1.p0nxv4f32(<vscale x 4 x i1> %pred,
+ <vscale x 4 x float>* %addr)
+ ret <vscale x 12 x float> %res
+}
+
+;
+; LD3D
+;
+
+define <vscale x 6 x i64> @ld3d_i64(<vscale x 2 x i1> %pred, <vscale x 2 x i64>* %addr) {
+; CHECK-LABEL: ld3d_i64:
+; CHECK: ld3d { z0.d, z1.d, z2.d }, p0/z, [x0]
+; CHECK-NEXT: ret
+ %res = call <vscale x 6 x i64> @llvm.aarch64.sve.ld3.nxv6i64.nxv2i1.p0nxv2i64(<vscale x 2 x i1> %pred,
+ <vscale x 2 x i64>* %addr)
+ ret <vscale x 6 x i64> %res
+}
+
+define <vscale x 6 x double> @ld3d_f64(<vscale x 2 x i1> %pred, <vscale x 2 x double>* %addr) {
+; CHECK-LABEL: ld3d_f64:
+; CHECK: ld3d { z0.d, z1.d, z2.d }, p0/z, [x0]
+; CHECK-NEXT: ret
+ %res = call <vscale x 6 x double> @llvm.aarch64.sve.ld3.nxv6f64.nxv2i1.p0nxv2f64(<vscale x 2 x i1> %pred,
+ <vscale x 2 x double>* %addr)
+ ret <vscale x 6 x double> %res
+}
+
+;
+; LD4B
+;
+
+define <vscale x 64 x i8> @ld4b_i8(<vscale x 16 x i1> %pred, <vscale x 16 x i8>* %addr) {
+; CHECK-LABEL: ld4b_i8:
+; CHECK: ld4b { z0.b, z1.b, z2.b, z3.b }, p0/z, [x0]
+; CHECK-NEXT: ret
+ %res = call <vscale x 64 x i8> @llvm.aarch64.sve.ld4.nxv64i8.nxv16i1.p0nxv16i8(<vscale x 16 x i1> %pred,
+ <vscale x 16 x i8>* %addr)
+ ret <vscale x 64 x i8> %res
+}
+
+;
+; LD4H
+;
+
+define <vscale x 32 x i16> @ld4h_i16(<vscale x 8 x i1> %pred, <vscale x 8 x i16>* %addr) {
+; CHECK-LABEL: ld4h_i16:
+; CHECK: ld4h { z0.h, z1.h, z2.h, z3.h }, p0/z, [x0]
+; CHECK-NEXT: ret
+ %res = call <vscale x 32 x i16> @llvm.aarch64.sve.ld4.nxv32i16.nxv8i1.p0nxv8i16(<vscale x 8 x i1> %pred,
+ <vscale x 8 x i16>* %addr)
+ ret <vscale x 32 x i16> %res
+}
+
+define <vscale x 32 x half> @ld4h_f16(<vscale x 8 x i1> %pred, <vscale x 8 x half>* %addr) {
+; CHECK-LABEL: ld4h_f16:
+; CHECK: ld4h { z0.h, z1.h, z2.h, z3.h }, p0/z, [x0]
+; CHECK-NEXT: ret
+ %res = call <vscale x 32 x half> @llvm.aarch64.sve.ld4.nxv32f16.nxv8i1.p0nxv8f16(<vscale x 8 x i1> %pred,
+ <vscale x 8 x half>* %addr)
+ ret <vscale x 32 x half> %res
+}
+
+;
+; LD4W
+;
+
+define <vscale x 16 x i32> @ld4w_i32(<vscale x 4 x i1> %pred, <vscale x 4 x i32>* %addr) {
+; CHECK-LABEL: ld4w_i32:
+; CHECK: ld4w { z0.s, z1.s, z2.s, z3.s }, p0/z, [x0]
+; CHECK-NEXT: ret
+ %res = call <vscale x 16 x i32> @llvm.aarch64.sve.ld4.nxv16i32.nxv4i1.p0nxv4i32(<vscale x 4 x i1> %pred,
+ <vscale x 4 x i32>* %addr)
+ ret <vscale x 16 x i32> %res
+}
+
+define <vscale x 16 x float> @ld4w_f32(<vscale x 4 x i1> %pred, <vscale x 4 x float>* %addr) {
+; CHECK-LABEL: ld4w_f32:
+; CHECK: ld4w { z0.s, z1.s, z2.s, z3.s }, p0/z, [x0]
+; CHECK-NEXT: ret
+ %res = call <vscale x 16 x float> @llvm.aarch64.sve.ld4.nxv16f32.nxv4i1.p0nxv4f32(<vscale x 4 x i1> %pred,
+ <vscale x 4 x float>* %addr)
+ ret <vscale x 16 x float> %res
+}
+
+;
+; LD4D
+;
+
+define <vscale x 8 x i64> @ld4d_i64(<vscale x 2 x i1> %pred, <vscale x 2 x i64>* %addr) {
+; CHECK-LABEL: ld4d_i64:
+; CHECK: ld4d { z0.d, z1.d, z2.d, z3.d }, p0/z, [x0]
+; CHECK-NEXT: ret
+ %res = call <vscale x 8 x i64> @llvm.aarch64.sve.ld4.nxv8i64.nxv2i1.p0nxv2i64(<vscale x 2 x i1> %pred,
+ <vscale x 2 x i64>* %addr)
+ ret <vscale x 8 x i64> %res
+}
+
+define <vscale x 8 x double> @ld4d_f64(<vscale x 2 x i1> %pred, <vscale x 2 x double>* %addr) {
+; CHECK-LABEL: ld4d_f64:
+; CHECK: ld4d { z0.d, z1.d, z2.d, z3.d }, p0/z, [x0]
+; CHECK-NEXT: ret
+ %res = call <vscale x 8 x double> @llvm.aarch64.sve.ld4.nxv8f64.nxv2i1.p0nxv2f64(<vscale x 2 x i1> %pred,
+ <vscale x 2 x double>* %addr)
+ ret <vscale x 8 x double> %res
+}
+
+
declare <vscale x 16 x i8> @llvm.aarch64.sve.ld1rq.nxv16i8(<vscale x 16 x i1>, i8*)
declare <vscale x 8 x i16> @llvm.aarch64.sve.ld1rq.nxv8i16(<vscale x 8 x i1>, i16*)
declare <vscale x 4 x i32> @llvm.aarch64.sve.ld1rq.nxv4i32(<vscale x 4 x i1>, i32*)
@@ -267,3 +505,27 @@ declare <vscale x 2 x i64> @llvm.aarch64.sve.ldnt1.nxv2i64(<vscale x 2 x i1>, i6
declare <vscale x 8 x half> @llvm.aarch64.sve.ldnt1.nxv8f16(<vscale x 8 x i1>, half*)
declare <vscale x 4 x float> @llvm.aarch64.sve.ldnt1.nxv4f32(<vscale x 4 x i1>, float*)
declare <vscale x 2 x double> @llvm.aarch64.sve.ldnt1.nxv2f64(<vscale x 2 x i1>, double*)
+
+declare <vscale x 32 x i8> @llvm.aarch64.sve.ld2.nxv32i8.nxv16i1.p0nxv16i8(<vscale x 16 x i1>, <vscale x 16 x i8>*)
+declare <vscale x 16 x i16> @llvm.aarch64.sve.ld2.nxv16i16.nxv8i1.p0nxv8i16(<vscale x 8 x i1>, <vscale x 8 x i16>*)
+declare <vscale x 8 x i32> @llvm.aarch64.sve.ld2.nxv8i32.nxv4i1.p0nxv4i32(<vscale x 4 x i1>, <vscale x 4 x i32>*)
+declare <vscale x 4 x i64> @llvm.aarch64.sve.ld2.nxv4i64.nxv2i1.p0nxv2i64(<vscale x 2 x i1>, <vscale x 2 x i64>*)
+declare <vscale x 16 x half> @llvm.aarch64.sve.ld2.nxv16f16.nxv8i1.p0nxv8f16(<vscale x 8 x i1>, <vscale x 8 x half>*)
+declare <vscale x 8 x float> @llvm.aarch64.sve.ld2.nxv8f32.nxv4i1.p0nxv4f32(<vscale x 4 x i1>, <vscale x 4 x float>*)
+declare <vscale x 4 x double> @llvm.aarch64.sve.ld2.nxv4f64.nxv2i1.p0nxv2f64(<vscale x 2 x i1>, <vscale x 2 x double>*)
+
+declare <vscale x 48 x i8> @llvm.aarch64.sve.ld3.nxv48i8.nxv16i1.p0nxv16i8(<vscale x 16 x i1>, <vscale x 16 x i8>*)
+declare <vscale x 24 x i16> @llvm.aarch64.sve.ld3.nxv24i16.nxv8i1.p0nxv8i16(<vscale x 8 x i1>, <vscale x 8 x i16>*)
+declare <vscale x 12 x i32> @llvm.aarch64.sve.ld3.nxv12i32.nxv4i1.p0nxv4i32(<vscale x 4 x i1>, <vscale x 4 x i32>*)
+declare <vscale x 6 x i64> @llvm.aarch64.sve.ld3.nxv6i64.nxv2i1.p0nxv2i64(<vscale x 2 x i1>, <vscale x 2 x i64>*)
+declare <vscale x 24 x half> @llvm.aarch64.sve.ld3.nxv24f16.nxv8i1.p0nxv8f16(<vscale x 8 x i1>, <vscale x 8 x half>*)
+declare <vscale x 12 x float> @llvm.aarch64.sve.ld3.nxv12f32.nxv4i1.p0nxv4f32(<vscale x 4 x i1>, <vscale x 4 x float>*)
+declare <vscale x 6 x double> @llvm.aarch64.sve.ld3.nxv6f64.nxv2i1.p0nxv2f64(<vscale x 2 x i1>, <vscale x 2 x double>*)
+
+declare <vscale x 64 x i8> @llvm.aarch64.sve.ld4.nxv64i8.nxv16i1.p0nxv16i8(<vscale x 16 x i1>, <vscale x 16 x i8>*)
+declare <vscale x 32 x i16> @llvm.aarch64.sve.ld4.nxv32i16.nxv8i1.p0nxv8i16(<vscale x 8 x i1>, <vscale x 8 x i16>*)
+declare <vscale x 16 x i32> @llvm.aarch64.sve.ld4.nxv16i32.nxv4i1.p0nxv4i32(<vscale x 4 x i1>, <vscale x 4 x i32>*)
+declare <vscale x 8 x i64> @llvm.aarch64.sve.ld4.nxv8i64.nxv2i1.p0nxv2i64(<vscale x 2 x i1>, <vscale x 2 x i64>*)
+declare <vscale x 32 x half> @llvm.aarch64.sve.ld4.nxv32f16.nxv8i1.p0nxv8f16(<vscale x 8 x i1>, <vscale x 8 x half>*)
+declare <vscale x 16 x float> @llvm.aarch64.sve.ld4.nxv16f32.nxv4i1.p0nxv4f32(<vscale x 4 x i1>, <vscale x 4 x float>*)
+declare <vscale x 8 x double> @llvm.aarch64.sve.ld4.nxv8f64.nxv2i1.p0nxv2f64(<vscale x 2 x i1>, <vscale x 2 x double>*)