[llvm] b82be5d - [AArch64][SVE] Implement structured load intrinsics

Cullen Rhodes via llvm-commits llvm-commits at lists.llvm.org
Tue Jun 9 01:52:23 PDT 2020


Author: Cullen Rhodes
Date: 2020-06-09T08:51:58Z
New Revision: b82be5db71fbe74f2407c7e38fb5e18fecaf08e4

URL: https://github.com/llvm/llvm-project/commit/b82be5db71fbe74f2407c7e38fb5e18fecaf08e4
DIFF: https://github.com/llvm/llvm-project/commit/b82be5db71fbe74f2407c7e38fb5e18fecaf08e4.diff

LOG: [AArch64][SVE] Implement structured load intrinsics

Summary:
This patch adds initial support for the following intrinsics:

    * llvm.aarch64.sve.ld2
    * llvm.aarch64.sve.ld3
    * llvm.aarch64.sve.ld4

These intrinsics load two, three, and four vectors' worth of data,
respectively. Basic codegen is implemented; the reg+reg and reg+imm addressing
modes will be addressed in a later patch.

The type returned by each of these intrinsics has N times as many elements as
a 128-bit vector of the given element type, where N is the number of vectors
being loaded, i.e. 2, 3 or 4. Thus, for 32-bit elements the types are:

    LD2 : <vscale x 8 x i32>
    LD3 : <vscale x 12 x i32>
    LD4 : <vscale x 16 x i32>

This is implemented with a target-specific DAG node for each variant
(AArch64ISD::SVE_LD2/3/4) that takes the same operands as the IR intrinsic but
returns N values, each of which is a full vector, i.e. <vscale x 4 x i32> in
the above example. These values are then concatenated with a standard
concat_vectors node to maintain type legality with the IR.

These intrinsics are intended for use in the Arm C Language
Extensions (ACLE).
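
As a minimal sketch (not part of this patch), assuming the arm_sve.h builtins
are lowered to these intrinsics, ACLE code such as the following is expected
to reach llvm.aarch64.sve.ld3 so that the load selects to a single ld3w. The
function name is hypothetical; the builtins are standard ACLE:

    // Illustrative C/C++ sketch: sum the three interleaved fields loaded by svld3.
    #include <arm_sve.h>

    svint32_t sum_fields(const int32_t *base) {
      svbool_t pg = svptrue_b32();              // all-true predicate
      svint32x3_t t = svld3_s32(pg, base);      // structured load of three vectors
      svint32_t acc = svadd_s32_z(pg, svget3_s32(t, 0), svget3_s32(t, 1));
      return svadd_s32_z(pg, acc, svget3_s32(t, 2));
    }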

Reviewed By: sdesmalen

Differential Revision: https://reviews.llvm.org/D75751

Added: 
    

Modified: 
    llvm/include/llvm/IR/IntrinsicsAArch64.td
    llvm/lib/Target/AArch64/AArch64ISelDAGToDAG.cpp
    llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
    llvm/lib/Target/AArch64/AArch64ISelLowering.h
    llvm/test/CodeGen/AArch64/sve-intrinsics-loads.ll

Removed: 
    


################################################################################
diff --git a/llvm/include/llvm/IR/IntrinsicsAArch64.td b/llvm/include/llvm/IR/IntrinsicsAArch64.td
index 22a3c2e6d9bf..d6755a2331c3 100644
--- a/llvm/include/llvm/IR/IntrinsicsAArch64.td
+++ b/llvm/include/llvm/IR/IntrinsicsAArch64.td
@@ -814,6 +814,10 @@ let TargetPrefix = "aarch64" in {  // All intrinsics start with "llvm.aarch64.".
     : Intrinsic<[llvm_anyvector_ty], [llvm_anyvector_ty, llvm_i32_ty],
                 [IntrReadMem, IntrArgMemOnly, ImmArg<ArgIndex<1>>]>;
 
+  class AdvSIMD_ManyVec_PredLoad_Intrinsic
+    : Intrinsic<[llvm_anyvector_ty], [llvm_anyvector_ty, llvm_anyptr_ty],
+                [IntrReadMem, IntrArgMemOnly]>;
+
   class AdvSIMD_1Vec_PredLoad_Intrinsic
     : Intrinsic<[llvm_anyvector_ty],
                 [LLVMScalarOrSameVectorWidth<0, llvm_i1_ty>,
@@ -1346,6 +1350,10 @@ def int_aarch64_sve_tuple_set : AdvSIMD_SVE_Set_Vector_Tuple;
 
 def int_aarch64_sve_ld1   : AdvSIMD_1Vec_PredLoad_Intrinsic;
 
+def int_aarch64_sve_ld2 : AdvSIMD_ManyVec_PredLoad_Intrinsic;
+def int_aarch64_sve_ld3 : AdvSIMD_ManyVec_PredLoad_Intrinsic;
+def int_aarch64_sve_ld4 : AdvSIMD_ManyVec_PredLoad_Intrinsic;
+
 def int_aarch64_sve_ldnt1 : AdvSIMD_1Vec_PredLoad_Intrinsic;
 def int_aarch64_sve_ldnf1 : AdvSIMD_1Vec_PredLoad_Intrinsic;
 def int_aarch64_sve_ldff1 : AdvSIMD_1Vec_PredLoad_Intrinsic;

diff --git a/llvm/lib/Target/AArch64/AArch64ISelDAGToDAG.cpp b/llvm/lib/Target/AArch64/AArch64ISelDAGToDAG.cpp
index ed297d338855..3e4dd878120c 100644
--- a/llvm/lib/Target/AArch64/AArch64ISelDAGToDAG.cpp
+++ b/llvm/lib/Target/AArch64/AArch64ISelDAGToDAG.cpp
@@ -245,6 +245,7 @@ class AArch64DAGToDAGISel : public SelectionDAGISel {
                          unsigned SubRegIdx);
   void SelectLoadLane(SDNode *N, unsigned NumVecs, unsigned Opc);
   void SelectPostLoadLane(SDNode *N, unsigned NumVecs, unsigned Opc);
+  void SelectPredicatedLoad(SDNode *N, unsigned NumVecs, const unsigned Opc);
 
   bool SelectAddrModeFrameIndexSVE(SDValue N, SDValue &Base, SDValue &OffImm);
   /// SVE Reg+Imm addressing mode.
@@ -1441,6 +1442,30 @@ AArch64DAGToDAGISel::findAddrModeSVELoadStore(SDNode *N, const unsigned Opc_rr,
   return std::make_tuple(IsRegReg ? Opc_rr : Opc_ri, NewBase, NewOffset);
 }
 
+void AArch64DAGToDAGISel::SelectPredicatedLoad(SDNode *N, unsigned NumVecs,
+                                               const unsigned Opc) {
+  SDLoc DL(N);
+  EVT VT = N->getValueType(0);
+  SDValue Chain = N->getOperand(0);
+
+  SDValue Ops[] = {N->getOperand(1), // Predicate
+                   N->getOperand(2), // Memory operand
+                   CurDAG->getTargetConstant(0, DL, MVT::i64), Chain};
+
+  const EVT ResTys[] = {MVT::Untyped, MVT::Other};
+
+  SDNode *Load = CurDAG->getMachineNode(Opc, DL, ResTys, Ops);
+  SDValue SuperReg = SDValue(Load, 0);
+  for (unsigned i = 0; i < NumVecs; ++i)
+    ReplaceUses(SDValue(N, i), CurDAG->getTargetExtractSubreg(
+                                   AArch64::zsub0 + i, DL, VT, SuperReg));
+
+  // Copy chain
+  unsigned ChainIdx = NumVecs;
+  ReplaceUses(SDValue(N, ChainIdx), SDValue(Load, 1));
+  CurDAG->RemoveDeadNode(N);
+}
+
 void AArch64DAGToDAGISel::SelectStore(SDNode *N, unsigned NumVecs,
                                       unsigned Opc) {
   SDLoc dl(N);
@@ -4603,6 +4628,54 @@ void AArch64DAGToDAGISel::Select(SDNode *Node) {
     }
     break;
   }
+  case AArch64ISD::SVE_LD2: {
+    if (VT == MVT::nxv16i8) {
+      SelectPredicatedLoad(Node, 2, AArch64::LD2B_IMM);
+      return;
+    } else if (VT == MVT::nxv8i16 || VT == MVT::nxv8f16) {
+      SelectPredicatedLoad(Node, 2, AArch64::LD2H_IMM);
+      return;
+    } else if (VT == MVT::nxv4i32 || VT == MVT::nxv4f32) {
+      SelectPredicatedLoad(Node, 2, AArch64::LD2W_IMM);
+      return;
+    } else if (VT == MVT::nxv2i64 || VT == MVT::nxv2f64) {
+      SelectPredicatedLoad(Node, 2, AArch64::LD2D_IMM);
+      return;
+    }
+    break;
+  }
+  case AArch64ISD::SVE_LD3: {
+    if (VT == MVT::nxv16i8) {
+      SelectPredicatedLoad(Node, 3, AArch64::LD3B_IMM);
+      return;
+    } else if (VT == MVT::nxv8i16 || VT == MVT::nxv8f16) {
+      SelectPredicatedLoad(Node, 3, AArch64::LD3H_IMM);
+      return;
+    } else if (VT == MVT::nxv4i32 || VT == MVT::nxv4f32) {
+      SelectPredicatedLoad(Node, 3, AArch64::LD3W_IMM);
+      return;
+    } else if (VT == MVT::nxv2i64 || VT == MVT::nxv2f64) {
+      SelectPredicatedLoad(Node, 3, AArch64::LD3D_IMM);
+      return;
+    }
+    break;
+  }
+  case AArch64ISD::SVE_LD4: {
+    if (VT == MVT::nxv16i8) {
+      SelectPredicatedLoad(Node, 4, AArch64::LD4B_IMM);
+      return;
+    } else if (VT == MVT::nxv8i16 || VT == MVT::nxv8f16) {
+      SelectPredicatedLoad(Node, 4, AArch64::LD4H_IMM);
+      return;
+    } else if (VT == MVT::nxv4i32 || VT == MVT::nxv4f32) {
+      SelectPredicatedLoad(Node, 4, AArch64::LD4W_IMM);
+      return;
+    } else if (VT == MVT::nxv2i64 || VT == MVT::nxv2f64) {
+      SelectPredicatedLoad(Node, 4, AArch64::LD4D_IMM);
+      return;
+    }
+    break;
+  }
   }
 
   // Select the default instruction

diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
index 43a85d510d6a..854c94638877 100644
--- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
+++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
@@ -1467,6 +1467,9 @@ const char *AArch64TargetLowering::getTargetNodeName(unsigned Opcode) const {
   case AArch64ISD::LDFF1S:            return "AArch64ISD::LDFF1S";
   case AArch64ISD::LD1RQ:             return "AArch64ISD::LD1RQ";
   case AArch64ISD::LD1RO:             return "AArch64ISD::LD1RO";
+  case AArch64ISD::SVE_LD2:           return "AArch64ISD::SVE_LD2";
+  case AArch64ISD::SVE_LD3:           return "AArch64ISD::SVE_LD3";
+  case AArch64ISD::SVE_LD4:           return "AArch64ISD::SVE_LD4";
   case AArch64ISD::GLD1:              return "AArch64ISD::GLD1";
   case AArch64ISD::GLD1_SCALED:       return "AArch64ISD::GLD1_SCALED";
   case AArch64ISD::GLD1_SXTW:         return "AArch64ISD::GLD1_SXTW";
@@ -9796,6 +9799,56 @@ bool AArch64TargetLowering::lowerInterleavedStore(StoreInst *SI,
   return true;
 }
 
+// Lower an SVE structured load intrinsic returning a tuple type to target
+// specific intrinsic taking the same input but returning a multi-result value
+// of the split tuple type.
+//
+// E.g. Lowering an LD3:
+//
+//  call <vscale x 12 x i32> @llvm.aarch64.sve.ld3.nxv12i32(
+//                                                    <vscale x 4 x i1> %pred,
+//                                                    <vscale x 4 x i32>* %addr)
+//
+//  Output DAG:
+//
+//    t0: ch = EntryToken
+//        t2: nxv4i1,ch = CopyFromReg t0, Register:nxv4i1 %0
+//        t4: i64,ch = CopyFromReg t0, Register:i64 %1
+//    t5: nxv4i32,nxv4i32,nxv4i32,ch = AArch64ISD::SVE_LD3 t0, t2, t4
+//    t6: nxv12i32 = concat_vectors t5, t5:1, t5:2
+//
+// This is called pre-legalization to avoid widening/splitting issues with
+// non-power-of-2 tuple types used for LD3, such as nxv12i32.
+SDValue AArch64TargetLowering::LowerSVEStructLoad(unsigned Intrinsic,
+                                                  ArrayRef<SDValue> LoadOps,
+                                                  EVT VT, SelectionDAG &DAG,
+                                                  const SDLoc &DL) const {
+  assert(VT.isScalableVector() && "Can only lower scalable vectors");
+
+  unsigned N, Opcode;
+  static std::map<unsigned, std::pair<unsigned, unsigned>> IntrinsicMap = {
+      {Intrinsic::aarch64_sve_ld2, {2, AArch64ISD::SVE_LD2}},
+      {Intrinsic::aarch64_sve_ld3, {3, AArch64ISD::SVE_LD3}},
+      {Intrinsic::aarch64_sve_ld4, {4, AArch64ISD::SVE_LD4}}};
+
+  std::tie(N, Opcode) = IntrinsicMap[Intrinsic];
+  assert(VT.getVectorElementCount().Min % N == 0 &&
+         "invalid tuple vector type!");
+
+  EVT SplitVT = EVT::getVectorVT(*DAG.getContext(), VT.getVectorElementType(),
+                                 VT.getVectorElementCount() / N);
+  assert(isTypeLegal(SplitVT));
+
+  SmallVector<EVT, 5> VTs(N, SplitVT);
+  VTs.push_back(MVT::Other); // Chain
+  SDVTList NodeTys = DAG.getVTList(VTs);
+
+  SDValue PseudoLoad = DAG.getNode(Opcode, DL, NodeTys, LoadOps);
+  SmallVector<SDValue, 4> PseudoLoadOps;
+  for (unsigned I = 0; I < N; ++I)
+    PseudoLoadOps.push_back(SDValue(PseudoLoad.getNode(), I));
+  return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, PseudoLoadOps);
+}
 
 EVT AArch64TargetLowering::getOptimalMemOpType(
     const MemOp &Op, const AttributeList &FuncAttributes) const {
@@ -13728,6 +13781,20 @@ SDValue AArch64TargetLowering::PerformDAGCombine(SDNode *N,
       SDValue Concat = DAG.getNode(ISD::CONCAT_VECTORS, DL, DestVT, Opnds);
       return DAG.getMergeValues({Concat, Chain}, DL);
     }
+    case Intrinsic::aarch64_sve_ld2:
+    case Intrinsic::aarch64_sve_ld3:
+    case Intrinsic::aarch64_sve_ld4: {
+      SDLoc DL(N);
+      SDValue Chain = N->getOperand(0);
+      SDValue Mask = N->getOperand(2);
+      SDValue BasePtr = N->getOperand(3);
+      SDValue LoadOps[] = {Chain, Mask, BasePtr};
+      unsigned IntrinsicID =
+          cast<ConstantSDNode>(N->getOperand(1))->getZExtValue();
+      SDValue Result =
+          LowerSVEStructLoad(IntrinsicID, LoadOps, N->getValueType(0), DAG, DL);
+      return DAG.getMergeValues({Result, Chain}, DL);
+    }
     default:
       break;
     }

diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.h b/llvm/lib/Target/AArch64/AArch64ISelLowering.h
index e42c0b6e05b7..2a68220b6283 100644
--- a/llvm/lib/Target/AArch64/AArch64ISelLowering.h
+++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.h
@@ -255,6 +255,11 @@ enum NodeType : unsigned {
   LD1RQ,
   LD1RO,
 
+  // Structured loads.
+  SVE_LD2,
+  SVE_LD3,
+  SVE_LD4,
+
   // Unsigned gather loads.
   GLD1,
   GLD1_SCALED,
@@ -835,6 +840,8 @@ class AArch64TargetLowering : public TargetLowering {
   SDValue LowerWindowsDYNAMIC_STACKALLOC(SDValue Op, SDValue Chain,
                                          SDValue &Size,
                                          SelectionDAG &DAG) const;
+  SDValue LowerSVEStructLoad(unsigned Intrinsic, ArrayRef<SDValue> LoadOps,
+                             EVT VT, SelectionDAG &DAG, const SDLoc &DL) const;
 
   SDValue BuildSDIVPow2(SDNode *N, const APInt &Divisor, SelectionDAG &DAG,
                         SmallVectorImpl<SDNode *> &Created) const override;

diff --git a/llvm/test/CodeGen/AArch64/sve-intrinsics-loads.ll b/llvm/test/CodeGen/AArch64/sve-intrinsics-loads.ll
index 74717d393f55..1244782bd56b 100644
--- a/llvm/test/CodeGen/AArch64/sve-intrinsics-loads.ll
+++ b/llvm/test/CodeGen/AArch64/sve-intrinsics-loads.ll
@@ -1,4 +1,4 @@
-; RUN: llc -mtriple=aarch64-linux-gnu -mattr=+sve < %s | FileCheck %s
+; RUN: llc -mtriple=aarch64-linux-gnu -mattr=+sve -asm-verbose=0 < %s | FileCheck %s
 
 ;
 ; LD1RQB
@@ -252,6 +252,244 @@ define <vscale x 2 x double> @ldnt1d_f64(<vscale x 2 x i1> %pred, double* %addr)
   ret <vscale x 2 x double> %res
 }
 
+;
+; LD2B
+;
+
+define <vscale x 32 x i8> @ld2b_i8(<vscale x 16 x i1> %pred, <vscale x 16 x i8>* %addr) {
+; CHECK-LABEL: ld2b_i8:
+; CHECK: ld2b { z0.b, z1.b }, p0/z, [x0]
+; CHECK-NEXT: ret
+  %res = call <vscale x 32 x i8> @llvm.aarch64.sve.ld2.nxv32i8.nxv16i1.p0nxv16i8(<vscale x 16 x i1> %pred,
+                                                                                 <vscale x 16 x i8>* %addr)
+  ret <vscale x 32 x i8> %res
+}
+
+;
+; LD2H
+;
+
+define <vscale x 16 x i16> @ld2h_i16(<vscale x 8 x i1> %pred, <vscale x 8 x i16>* %addr) {
+; CHECK-LABEL: ld2h_i16:
+; CHECK: ld2h { z0.h, z1.h }, p0/z, [x0]
+; CHECK-NEXT: ret
+  %res = call <vscale x 16 x i16> @llvm.aarch64.sve.ld2.nxv16i16.nxv8i1.p0nxv8i16(<vscale x 8 x i1> %pred,
+                                                                                  <vscale x 8 x i16>* %addr)
+  ret <vscale x 16 x i16> %res
+}
+
+define <vscale x 16 x half> @ld2h_f16(<vscale x 8 x i1> %pred, <vscale x 8 x half>* %addr) {
+; CHECK-LABEL: ld2h_f16:
+; CHECK: ld2h { z0.h, z1.h }, p0/z, [x0]
+; CHECK-NEXT: ret
+  %res = call <vscale x 16 x half> @llvm.aarch64.sve.ld2.nxv16f16.nxv8i1.p0nxv8f16(<vscale x 8 x i1> %pred,
+                                                                                   <vscale x 8 x half>* %addr)
+  ret <vscale x 16 x half> %res
+}
+
+;
+; LD2W
+;
+
+define <vscale x 8 x i32> @ld2w_i32(<vscale x 4 x i1> %pred, <vscale x 4 x i32>* %addr) {
+; CHECK-LABEL: ld2w_i32:
+; CHECK: ld2w { z0.s, z1.s }, p0/z, [x0]
+; CHECK-NEXT: ret
+  %res = call <vscale x 8 x i32> @llvm.aarch64.sve.ld2.nxv8i32.nxv4i1.p0nxv4i32(<vscale x 4 x i1> %pred,
+                                                                                <vscale x 4 x i32>* %addr)
+  ret <vscale x 8 x i32> %res
+}
+
+define <vscale x 8 x float> @ld2w_f32(<vscale x 4 x i1> %pred, <vscale x 4 x float>* %addr) {
+; CHECK-LABEL: ld2w_f32:
+; CHECK: ld2w { z0.s, z1.s }, p0/z, [x0]
+; CHECK-NEXT: ret
+  %res = call <vscale x 8 x float> @llvm.aarch64.sve.ld2.nxv8f32.nxv4i1.p0nxv4f32(<vscale x 4 x i1> %pred,
+                                                                                  <vscale x 4 x float>* %addr)
+  ret <vscale x 8 x float> %res
+}
+
+;
+; LD2D
+;
+
+define <vscale x 4 x i64> @ld2d_i64(<vscale x 2 x i1> %pred, <vscale x 2 x i64>* %addr) {
+; CHECK-LABEL: ld2d_i64:
+; CHECK: ld2d { z0.d, z1.d }, p0/z, [x0]
+; CHECK-NEXT: ret
+  %res = call <vscale x 4 x i64> @llvm.aarch64.sve.ld2.nxv4i64.nxv2i1.p0nxv2i64(<vscale x 2 x i1> %pred,
+                                                                                <vscale x 2 x i64>* %addr)
+  ret <vscale x 4 x i64> %res
+}
+
+define <vscale x 4 x double> @ld2d_f64(<vscale x 2 x i1> %pred, <vscale x 2 x double>* %addr) {
+; CHECK-LABEL: ld2d_f64:
+; CHECK: ld2d { z0.d, z1.d }, p0/z, [x0]
+; CHECK-NEXT: ret
+  %res = call <vscale x 4 x double> @llvm.aarch64.sve.ld2.nxv4f64.nxv2i1.p0nxv2f64(<vscale x 2 x i1> %pred,
+                                                                                   <vscale x 2 x double>* %addr)
+  ret <vscale x 4 x double> %res
+}
+
+;
+; LD3B
+;
+
+define <vscale x 48 x i8> @ld3b_i8(<vscale x 16 x i1> %pred, <vscale x 16 x i8>* %addr) {
+; CHECK-LABEL: ld3b_i8:
+; CHECK: ld3b { z0.b, z1.b, z2.b }, p0/z, [x0]
+; CHECK-NEXT: ret
+  %res = call <vscale x 48 x i8> @llvm.aarch64.sve.ld3.nxv48i8.nxv16i1.p0nxv16i8(<vscale x 16 x i1> %pred,
+                                                                                 <vscale x 16 x i8>* %addr)
+  ret <vscale x 48 x i8> %res
+}
+
+;
+; LD3H
+;
+
+define <vscale x 24 x i16> @ld3h_i16(<vscale x 8 x i1> %pred, <vscale x 8 x i16>* %addr) {
+; CHECK-LABEL: ld3h_i16:
+; CHECK: ld3h { z0.h, z1.h, z2.h }, p0/z, [x0]
+; CHECK-NEXT: ret
+  %res = call <vscale x 24 x i16> @llvm.aarch64.sve.ld3.nxv24i16.nxv8i1.p0nxv8i16(<vscale x 8 x i1> %pred,
+                                                                                  <vscale x 8 x i16>* %addr)
+  ret <vscale x 24 x i16> %res
+}
+
+define <vscale x 24 x half> @ld3h_f16(<vscale x 8 x i1> %pred, <vscale x 8 x half>* %addr) {
+; CHECK-LABEL: ld3h_f16:
+; CHECK: ld3h { z0.h, z1.h, z2.h }, p0/z, [x0]
+; CHECK-NEXT: ret
+  %res = call <vscale x 24 x half> @llvm.aarch64.sve.ld3.nxv24f16.nxv8i1.p0nxv8f16(<vscale x 8 x i1> %pred,
+                                                                                   <vscale x 8 x half>* %addr)
+  ret <vscale x 24 x half> %res
+}
+
+;
+; LD3W
+;
+
+define <vscale x 12 x i32> @ld3w_i32(<vscale x 4 x i1> %pred, <vscale x 4 x i32>* %addr) {
+; CHECK-LABEL: ld3w_i32:
+; CHECK: ld3w { z0.s, z1.s, z2.s }, p0/z, [x0]
+; CHECK-NEXT: ret
+  %res = call <vscale x 12 x i32> @llvm.aarch64.sve.ld3.nxv12i32.nxv4i1.p0nxv4i32(<vscale x 4 x i1> %pred,
+                                                                                  <vscale x 4 x i32>* %addr)
+  ret <vscale x 12 x i32> %res
+}
+
+define <vscale x 12 x float> @ld3w_f32(<vscale x 4 x i1> %pred, <vscale x 4 x float>* %addr) {
+; CHECK-LABEL: ld3w_f32:
+; CHECK: ld3w { z0.s, z1.s, z2.s }, p0/z, [x0]
+; CHECK-NEXT: ret
+  %res = call <vscale x 12 x float> @llvm.aarch64.sve.ld3.nxv12f32.nxv4i1.p0nxv4f32(<vscale x 4 x i1> %pred,
+                                                                                    <vscale x 4 x float>* %addr)
+  ret <vscale x 12 x float> %res
+}
+
+;
+; LD3D
+;
+
+define <vscale x 6 x i64> @ld3d_i64(<vscale x 2 x i1> %pred, <vscale x 2 x i64>* %addr) {
+; CHECK-LABEL: ld3d_i64:
+; CHECK: ld3d { z0.d, z1.d, z2.d }, p0/z, [x0]
+; CHECK-NEXT: ret
+  %res = call <vscale x 6 x i64> @llvm.aarch64.sve.ld3.nxv6i64.nxv2i1.p0nxv2i64(<vscale x 2 x i1> %pred,
+                                                                                <vscale x 2 x i64>* %addr)
+  ret <vscale x 6 x i64> %res
+}
+
+define <vscale x 6 x double> @ld3d_f64(<vscale x 2 x i1> %pred, <vscale x 2 x double>* %addr) {
+; CHECK-LABEL: ld3d_f64:
+; CHECK: ld3d { z0.d, z1.d, z2.d }, p0/z, [x0]
+; CHECK-NEXT: ret
+  %res = call <vscale x 6 x double> @llvm.aarch64.sve.ld3.nxv6f64.nxv2i1.p0nxv2f64(<vscale x 2 x i1> %pred,
+                                                                                   <vscale x 2 x double>* %addr)
+  ret <vscale x 6 x double> %res
+}
+
+;
+; LD4B
+;
+
+define <vscale x 64 x i8> @ld4b_i8(<vscale x 16 x i1> %pred, <vscale x 16 x i8>* %addr) {
+; CHECK-LABEL: ld4b_i8:
+; CHECK: ld4b { z0.b, z1.b, z2.b, z3.b }, p0/z, [x0]
+; CHECK-NEXT: ret
+  %res = call <vscale x 64 x i8> @llvm.aarch64.sve.ld4.nxv64i8.nxv16i1.p0nxv16i8(<vscale x 16 x i1> %pred,
+                                                                                 <vscale x 16 x i8>* %addr)
+  ret <vscale x 64 x i8> %res
+}
+
+;
+; LD4H
+;
+
+define <vscale x 32 x i16> @ld4h_i16(<vscale x 8 x i1> %pred, <vscale x 8 x i16>* %addr) {
+; CHECK-LABEL: ld4h_i16:
+; CHECK: ld4h { z0.h, z1.h, z2.h, z3.h }, p0/z, [x0]
+; CHECK-NEXT: ret
+  %res = call <vscale x 32 x i16> @llvm.aarch64.sve.ld4.nxv32i16.nxv8i1.p0nxv8i16(<vscale x 8 x i1> %pred,
+                                                                                  <vscale x 8 x i16>* %addr)
+  ret <vscale x 32 x i16> %res
+}
+
+define <vscale x 32 x half> @ld4h_f16(<vscale x 8 x i1> %pred, <vscale x 8 x half>* %addr) {
+; CHECK-LABEL: ld4h_f16:
+; CHECK: ld4h { z0.h, z1.h, z2.h, z3.h }, p0/z, [x0]
+; CHECK-NEXT: ret
+  %res = call <vscale x 32 x half> @llvm.aarch64.sve.ld4.nxv32f16.nxv8i1.p0nxv8f16(<vscale x 8 x i1> %pred,
+                                                                                   <vscale x 8 x half>* %addr)
+  ret <vscale x 32 x half> %res
+}
+
+;
+; LD4W
+;
+
+define <vscale x 16 x i32> @ld4w_i32(<vscale x 4 x i1> %pred, <vscale x 4 x i32>* %addr) {
+; CHECK-LABEL: ld4w_i32:
+; CHECK: ld4w { z0.s, z1.s, z2.s, z3.s }, p0/z, [x0]
+; CHECK-NEXT: ret
+  %res = call <vscale x 16 x i32> @llvm.aarch64.sve.ld4.nxv16i32.nxv4i1.p0nxv4i32(<vscale x 4 x i1> %pred,
+                                                                                  <vscale x 4 x i32>* %addr)
+  ret <vscale x 16 x i32> %res
+}
+
+define <vscale x 16 x float> @ld4w_f32(<vscale x 4 x i1> %pred, <vscale x 4 x float>* %addr) {
+; CHECK-LABEL: ld4w_f32:
+; CHECK: ld4w { z0.s, z1.s, z2.s, z3.s }, p0/z, [x0]
+; CHECK-NEXT: ret
+  %res = call <vscale x 16 x float> @llvm.aarch64.sve.ld4.nxv16f32.nxv4i1.p0nxv4f32(<vscale x 4 x i1> %pred,
+                                                                                    <vscale x 4 x float>* %addr)
+  ret <vscale x 16 x float> %res
+}
+
+;
+; LD4D
+;
+
+define <vscale x 8 x i64> @ld4d_i64(<vscale x 2 x i1> %pred, <vscale x 2 x i64>* %addr) {
+; CHECK-LABEL: ld4d_i64:
+; CHECK: ld4d { z0.d, z1.d, z2.d, z3.d }, p0/z, [x0]
+; CHECK-NEXT: ret
+  %res = call <vscale x 8 x i64> @llvm.aarch64.sve.ld4.nxv8i64.nxv2i1.p0nxv2i64(<vscale x 2 x i1> %pred,
+                                                                                <vscale x 2 x i64>* %addr)
+  ret <vscale x 8 x i64> %res
+}
+
+define <vscale x 8 x double> @ld4d_f64(<vscale x 2 x i1> %pred, <vscale x 2 x double>* %addr) {
+; CHECK-LABEL: ld4d_f64:
+; CHECK: ld4d { z0.d, z1.d, z2.d, z3.d }, p0/z, [x0]
+; CHECK-NEXT: ret
+  %res = call <vscale x 8 x double> @llvm.aarch64.sve.ld4.nxv8f64.nxv2i1.p0nxv2f64(<vscale x 2 x i1> %pred,
+                                                                                   <vscale x 2 x double>* %addr)
+  ret <vscale x 8 x double> %res
+}
+
+
 declare <vscale x 16 x i8> @llvm.aarch64.sve.ld1rq.nxv16i8(<vscale x 16 x i1>, i8*)
 declare <vscale x 8 x i16> @llvm.aarch64.sve.ld1rq.nxv8i16(<vscale x 8 x i1>, i16*)
 declare <vscale x 4 x i32> @llvm.aarch64.sve.ld1rq.nxv4i32(<vscale x 4 x i1>, i32*)
@@ -267,3 +505,27 @@ declare <vscale x 2 x i64> @llvm.aarch64.sve.ldnt1.nxv2i64(<vscale x 2 x i1>, i6
 declare <vscale x 8 x half> @llvm.aarch64.sve.ldnt1.nxv8f16(<vscale x 8 x i1>, half*)
 declare <vscale x 4 x float> @llvm.aarch64.sve.ldnt1.nxv4f32(<vscale x 4 x i1>, float*)
 declare <vscale x 2 x double> @llvm.aarch64.sve.ldnt1.nxv2f64(<vscale x 2 x i1>, double*)
+
+declare <vscale x 32 x i8> @llvm.aarch64.sve.ld2.nxv32i8.nxv16i1.p0nxv16i8(<vscale x 16 x i1>, <vscale x 16 x i8>*)
+declare <vscale x 16 x i16> @llvm.aarch64.sve.ld2.nxv16i16.nxv8i1.p0nxv8i16(<vscale x 8 x i1>, <vscale x 8 x i16>*)
+declare <vscale x 8 x i32> @llvm.aarch64.sve.ld2.nxv8i32.nxv4i1.p0nxv4i32(<vscale x 4 x i1>, <vscale x 4 x i32>*)
+declare <vscale x 4 x i64> @llvm.aarch64.sve.ld2.nxv4i64.nxv2i1.p0nxv2i64(<vscale x 2 x i1>, <vscale x 2 x i64>*)
+declare <vscale x 16 x half> @llvm.aarch64.sve.ld2.nxv16f16.nxv8i1.p0nxv8f16(<vscale x 8 x i1>, <vscale x 8 x half>*)
+declare <vscale x 8 x float> @llvm.aarch64.sve.ld2.nxv8f32.nxv4i1.p0nxv4f32(<vscale x 4 x i1>, <vscale x 4 x float>*)
+declare <vscale x 4 x double> @llvm.aarch64.sve.ld2.nxv4f64.nxv2i1.p0nxv2f64(<vscale x 2 x i1>, <vscale x 2 x double>*)
+
+declare <vscale x 48 x i8> @llvm.aarch64.sve.ld3.nxv48i8.nxv16i1.p0nxv16i8(<vscale x 16 x i1>, <vscale x 16 x i8>*)
+declare <vscale x 24 x i16> @llvm.aarch64.sve.ld3.nxv24i16.nxv8i1.p0nxv8i16(<vscale x 8 x i1>, <vscale x 8 x i16>*)
+declare <vscale x 12 x i32> @llvm.aarch64.sve.ld3.nxv12i32.nxv4i1.p0nxv4i32(<vscale x 4 x i1>, <vscale x 4 x i32>*)
+declare <vscale x 6 x i64> @llvm.aarch64.sve.ld3.nxv6i64.nxv2i1.p0nxv2i64(<vscale x 2 x i1>, <vscale x 2 x i64>*)
+declare <vscale x 24 x half> @llvm.aarch64.sve.ld3.nxv24f16.nxv8i1.p0nxv8f16(<vscale x 8 x i1>, <vscale x 8 x half>*)
+declare <vscale x 12 x float> @llvm.aarch64.sve.ld3.nxv12f32.nxv4i1.p0nxv4f32(<vscale x 4 x i1>, <vscale x 4 x float>*)
+declare <vscale x 6 x double> @llvm.aarch64.sve.ld3.nxv6f64.nxv2i1.p0nxv2f64(<vscale x 2 x i1>, <vscale x 2 x double>*)
+
+declare <vscale x 64 x i8> @llvm.aarch64.sve.ld4.nxv64i8.nxv16i1.p0nxv16i8(<vscale x 16 x i1>, <vscale x 16 x i8>*)
+declare <vscale x 32 x i16> @llvm.aarch64.sve.ld4.nxv32i16.nxv8i1.p0nxv8i16(<vscale x 8 x i1>, <vscale x 8 x i16>*)
+declare <vscale x 16 x i32> @llvm.aarch64.sve.ld4.nxv16i32.nxv4i1.p0nxv4i32(<vscale x 4 x i1>, <vscale x 4 x i32>*)
+declare <vscale x 8 x i64> @llvm.aarch64.sve.ld4.nxv8i64.nxv2i1.p0nxv2i64(<vscale x 2 x i1>, <vscale x 2 x i64>*)
+declare <vscale x 32 x half> @llvm.aarch64.sve.ld4.nxv32f16.nxv8i1.p0nxv8f16(<vscale x 8 x i1>, <vscale x 8 x half>*)
+declare <vscale x 16 x float> @llvm.aarch64.sve.ld4.nxv16f32.nxv4i1.p0nxv4f32(<vscale x 4 x i1>, <vscale x 4 x float>*)
+declare <vscale x 8 x double> @llvm.aarch64.sve.ld4.nxv8f64.nxv2i1.p0nxv2f64(<vscale x 2 x i1>, <vscale x 2 x double>*)

