[llvm] 387c49f - [AArch64][SME2/SVE2p1] Add predicate-as-counter intrinsics for ld1/ldnt1/st1/stnt1
Sander de Smalen via llvm-commits
llvm-commits at lists.llvm.org
Mon May 22 06:51:39 PDT 2023
Author: Sander de Smalen
Date: 2023-05-22T13:51:02Z
New Revision: 387c49f693c82bdf8b9b0f1ef48a92f51bb781b4
URL: https://github.com/llvm/llvm-project/commit/387c49f693c82bdf8b9b0f1ef48a92f51bb781b4
DIFF: https://github.com/llvm/llvm-project/commit/387c49f693c82bdf8b9b0f1ef48a92f51bb781b4.diff
LOG: [AArch64][SME2/SVE2p1] Add predicate-as-counter intrinsics for ld1/ldnt1/st1/stnt1
These intrinsics are used to implement the multi-vector load/store ACLE intrinsics, which load
or store a tuple of 2 or 4 vectors based on a predicate-as-counter operand, e.g.
__attribute__((arm_streaming))
svuint8x2_t svld1[_u8]_x2(svcount_t png, const uint8_t *rn);
__attribute__((arm_streaming))
void svst1[_u8_x2](svcount_t png, uint8_t *rn, svuint8x2_t zt);
As described in https://github.com/ARM-software/acle/pull/217
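At the LLVM IR level this maps onto the new llvm.aarch64.sve.{ld1,ldnt1,st1,stnt1}.pn.x{2,4}
intrinsics. A minimal sketch of the expected IR, taken from the tests added below (the value
names %pn, %ptr, %zn0 and %zn1 are illustrative only):
  ; two-vector predicate-as-counter load returning a pair of scalable vectors
  %res = call { <vscale x 16 x i8>, <vscale x 16 x i8> } @llvm.aarch64.sve.ld1.pn.x2.nxv16i8(target("aarch64.svcount") %pn, ptr %ptr)
  ; matching two-vector store of consecutive registers
  call void @llvm.aarch64.sve.st1.pn.x2.nxv16i8(<vscale x 16 x i8> %zn0, <vscale x 16 x i8> %zn1, target("aarch64.svcount") %pn, ptr %ptr)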
Reviewed By: CarolineConcatto
Differential Revision: https://reviews.llvm.org/D150956
Added:
llvm/test/CodeGen/AArch64/sve2p1-intrinsics-loads.ll
llvm/test/CodeGen/AArch64/sve2p1-intrinsics-stores.ll
Modified:
llvm/include/llvm/IR/IntrinsicsAArch64.td
llvm/lib/Target/AArch64/AArch64ISelDAGToDAG.cpp
llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td
Removed:
################################################################################
diff --git a/llvm/include/llvm/IR/IntrinsicsAArch64.td b/llvm/include/llvm/IR/IntrinsicsAArch64.td
index d0e354bb962c..2c3082c26049 100644
--- a/llvm/include/llvm/IR/IntrinsicsAArch64.td
+++ b/llvm/include/llvm/IR/IntrinsicsAArch64.td
@@ -2600,6 +2600,46 @@ def int_aarch64_sve_bfmmla : SVE_4Vec_BF16;
def int_aarch64_sve_bfdot_lane_v2 : SVE_4Vec_BF16_Indexed;
def int_aarch64_sve_bfmlalb_lane_v2 : SVE_4Vec_BF16_Indexed;
def int_aarch64_sve_bfmlalt_lane_v2 : SVE_4Vec_BF16_Indexed;
+
+//
+// SVE2.1 - Contiguous loads to multiple consecutive vectors
+//
+
+ class SVE2p1_Load_PN_X2_Intrinsic
+ : DefaultAttrsIntrinsic<[llvm_anyvector_ty, LLVMMatchType<0>],
+ [llvm_aarch64_svcount_ty, llvm_ptr_ty],
+ [IntrReadMem, IntrArgMemOnly]>;
+
+ class SVE2p1_Load_PN_X4_Intrinsic
+ : DefaultAttrsIntrinsic<[llvm_anyvector_ty, LLVMMatchType<0>,
+ LLVMMatchType<0>, LLVMMatchType<0>],
+ [llvm_aarch64_svcount_ty, llvm_ptr_ty],
+ [IntrReadMem, IntrArgMemOnly]>;
+
+def int_aarch64_sve_ld1_pn_x2 : SVE2p1_Load_PN_X2_Intrinsic;
+def int_aarch64_sve_ld1_pn_x4 : SVE2p1_Load_PN_X4_Intrinsic;
+def int_aarch64_sve_ldnt1_pn_x2 : SVE2p1_Load_PN_X2_Intrinsic;
+def int_aarch64_sve_ldnt1_pn_x4 : SVE2p1_Load_PN_X4_Intrinsic;
+
+//
+// SVE2.1 - Contiguous stores to multiple consecutive vectors
+//
+
+ class SVE2p1_Store_PN_X2_Intrinsic
+ : DefaultAttrsIntrinsic<[], [ llvm_anyvector_ty, LLVMMatchType<0>,
+ llvm_aarch64_svcount_ty, llvm_ptr_ty ],
+ [IntrWriteMem, IntrArgMemOnly]>;
+
+ class SVE2p1_Store_PN_X4_Intrinsic
+ : DefaultAttrsIntrinsic<[], [ llvm_anyvector_ty, LLVMMatchType<0>,
+ LLVMMatchType<0>, LLVMMatchType<0>,
+ llvm_aarch64_svcount_ty, llvm_ptr_ty],
+ [IntrWriteMem, IntrArgMemOnly]>;
+
+def int_aarch64_sve_st1_pn_x2 : SVE2p1_Store_PN_X2_Intrinsic;
+def int_aarch64_sve_st1_pn_x4 : SVE2p1_Store_PN_X4_Intrinsic;
+def int_aarch64_sve_stnt1_pn_x2 : SVE2p1_Store_PN_X2_Intrinsic;
+def int_aarch64_sve_stnt1_pn_x4 : SVE2p1_Store_PN_X4_Intrinsic;
}
//
diff --git a/llvm/lib/Target/AArch64/AArch64ISelDAGToDAG.cpp b/llvm/lib/Target/AArch64/AArch64ISelDAGToDAG.cpp
index e83a386983bb..7bb7480b8a2a 100644
--- a/llvm/lib/Target/AArch64/AArch64ISelDAGToDAG.cpp
+++ b/llvm/lib/Target/AArch64/AArch64ISelDAGToDAG.cpp
@@ -370,6 +370,9 @@ class AArch64DAGToDAGISel : public SelectionDAGISel {
void SelectPredicatedLoad(SDNode *N, unsigned NumVecs, unsigned Scale,
unsigned Opc_rr, unsigned Opc_ri,
bool IsIntr = false);
+ void SelectContiguousMultiVectorLoad(SDNode *N, unsigned NumVecs,
+ unsigned Scale, unsigned Opc_rr,
+ unsigned Opc_ri);
void SelectDestructiveMultiIntrinsic(SDNode *N, unsigned NumVecs,
bool IsZmMulti, unsigned Opcode,
bool HasPred = false);
@@ -1779,6 +1782,39 @@ void AArch64DAGToDAGISel::SelectPredicatedLoad(SDNode *N, unsigned NumVecs,
CurDAG->RemoveDeadNode(N);
}
+void AArch64DAGToDAGISel::SelectContiguousMultiVectorLoad(SDNode *N,
+ unsigned NumVecs,
+ unsigned Scale,
+ unsigned Opc_ri,
+ unsigned Opc_rr) {
+ assert(Scale < 4 && "Invalid scaling value.");
+ SDLoc DL(N);
+ EVT VT = N->getValueType(0);
+ SDValue Chain = N->getOperand(0);
+
+ // Use simplest addressing mode for now - base + 0 offset
+ SDValue PNg = N->getOperand(2);
+ SDValue Base = N->getOperand(3);
+ SDValue Offset = CurDAG->getTargetConstant(0, DL, MVT::i64);
+
+ SDValue Ops[] = {PNg, // Predicate-as-counter
+ Base, // Memory operand
+ Offset, Chain};
+
+ const EVT ResTys[] = {MVT::Untyped, MVT::Other};
+
+ SDNode *Load = CurDAG->getMachineNode(Opc_ri, DL, ResTys, Ops);
+ SDValue SuperReg = SDValue(Load, 0);
+ for (unsigned i = 0; i < NumVecs; ++i)
+ ReplaceUses(SDValue(N, i), CurDAG->getTargetExtractSubreg(
+ AArch64::zsub0 + i, DL, VT, SuperReg));
+
+ // Copy chain
+ unsigned ChainIdx = NumVecs;
+ ReplaceUses(SDValue(N, ChainIdx), SDValue(Load, 1));
+ CurDAG->RemoveDeadNode(N);
+}
+
void AArch64DAGToDAGISel::SelectFrintFromVT(SDNode *N, unsigned NumVecs,
unsigned Opcode) {
if (N->getValueType(0) != MVT::nxv4f32)
@@ -4655,6 +4691,74 @@ void AArch64DAGToDAGISel::Select(SDNode *Node) {
}
break;
}
+ case Intrinsic::aarch64_sve_ld1_pn_x2: {
+ if (VT == MVT::nxv16i8) {
+ SelectContiguousMultiVectorLoad(Node, 2, 0, AArch64::LD1B_2Z_IMM, AArch64::LD1B_2Z);
+ return;
+ } else if (VT == MVT::nxv8i16 || VT == MVT::nxv8f16 ||
+ VT == MVT::nxv8bf16) {
+ SelectContiguousMultiVectorLoad(Node, 2, 1, AArch64::LD1H_2Z_IMM, AArch64::LD1H_2Z);
+ return;
+ } else if (VT == MVT::nxv4i32 || VT == MVT::nxv4f32) {
+ SelectContiguousMultiVectorLoad(Node, 2, 2, AArch64::LD1W_2Z_IMM, AArch64::LD1W_2Z);
+ return;
+ } else if (VT == MVT::nxv2i64 || VT == MVT::nxv2f64) {
+ SelectContiguousMultiVectorLoad(Node, 2, 3, AArch64::LD1D_2Z_IMM, AArch64::LD1D_2Z);
+ return;
+ }
+ break;
+ }
+ case Intrinsic::aarch64_sve_ld1_pn_x4: {
+ if (VT == MVT::nxv16i8) {
+ SelectContiguousMultiVectorLoad(Node, 4, 0, AArch64::LD1B_4Z_IMM, AArch64::LD1B_4Z);
+ return;
+ } else if (VT == MVT::nxv8i16 || VT == MVT::nxv8f16 ||
+ VT == MVT::nxv8bf16) {
+ SelectContiguousMultiVectorLoad(Node, 4, 1, AArch64::LD1H_4Z_IMM, AArch64::LD1H_4Z);
+ return;
+ } else if (VT == MVT::nxv4i32 || VT == MVT::nxv4f32) {
+ SelectContiguousMultiVectorLoad(Node, 4, 2, AArch64::LD1W_4Z_IMM, AArch64::LD1W_4Z);
+ return;
+ } else if (VT == MVT::nxv2i64 || VT == MVT::nxv2f64) {
+ SelectContiguousMultiVectorLoad(Node, 4, 3, AArch64::LD1D_4Z_IMM, AArch64::LD1D_4Z);
+ return;
+ }
+ break;
+ }
+ case Intrinsic::aarch64_sve_ldnt1_pn_x2: {
+ if (VT == MVT::nxv16i8) {
+ SelectContiguousMultiVectorLoad(Node, 2, 0, AArch64::LDNT1B_2Z_IMM, AArch64::LDNT1B_2Z);
+ return;
+ } else if (VT == MVT::nxv8i16 || VT == MVT::nxv8f16 ||
+ VT == MVT::nxv8bf16) {
+ SelectContiguousMultiVectorLoad(Node, 2, 1, AArch64::LDNT1H_2Z_IMM, AArch64::LDNT1H_2Z);
+ return;
+ } else if (VT == MVT::nxv4i32 || VT == MVT::nxv4f32) {
+ SelectContiguousMultiVectorLoad(Node, 2, 2, AArch64::LDNT1W_2Z_IMM, AArch64::LDNT1W_2Z);
+ return;
+ } else if (VT == MVT::nxv2i64 || VT == MVT::nxv2f64) {
+ SelectContiguousMultiVectorLoad(Node, 2, 3, AArch64::LDNT1D_2Z_IMM, AArch64::LDNT1D_2Z);
+ return;
+ }
+ break;
+ }
+ case Intrinsic::aarch64_sve_ldnt1_pn_x4: {
+ if (VT == MVT::nxv16i8) {
+ SelectContiguousMultiVectorLoad(Node, 4, 0, AArch64::LDNT1B_4Z_IMM, AArch64::LDNT1B_4Z);
+ return;
+ } else if (VT == MVT::nxv8i16 || VT == MVT::nxv8f16 ||
+ VT == MVT::nxv8bf16) {
+ SelectContiguousMultiVectorLoad(Node, 4, 1, AArch64::LDNT1H_4Z_IMM, AArch64::LDNT1H_4Z);
+ return;
+ } else if (VT == MVT::nxv4i32 || VT == MVT::nxv4f32) {
+ SelectContiguousMultiVectorLoad(Node, 4, 2, AArch64::LDNT1W_4Z_IMM, AArch64::LDNT1W_4Z);
+ return;
+ } else if (VT == MVT::nxv2i64 || VT == MVT::nxv2f64) {
+ SelectContiguousMultiVectorLoad(Node, 4, 3, AArch64::LDNT1D_4Z_IMM, AArch64::LDNT1D_4Z);
+ return;
+ }
+ break;
+ }
case Intrinsic::aarch64_sve_ld3_sret: {
if (VT == MVT::nxv16i8) {
SelectPredicatedLoad(Node, 3, 0, AArch64::LD3B_IMM, AArch64::LD3B,
diff --git a/llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td b/llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td
index b93a9b5fa546..634332316d24 100644
--- a/llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td
+++ b/llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td
@@ -3837,6 +3837,59 @@ defm STNT1H_4Z_IMM : sve2p1_mem_cst_si_4z<"stnt1h", 0b01, 0b1, ZZZZ_h_mul_r>;
defm STNT1W_4Z_IMM : sve2p1_mem_cst_si_4z<"stnt1w", 0b10, 0b1, ZZZZ_s_mul_r>;
defm STNT1D_4Z_IMM : sve2p1_mem_cst_si_4z<"stnt1d", 0b11, 0b1, ZZZZ_d_mul_r>;
+multiclass store_pn_x2<ValueType Ty, SDPatternOperator Store,
+ Instruction RegImmInst> {
+ def : Pat<(Store (Ty ZPR:$vec0), (Ty ZPR:$vec1),
+ (aarch64svcount PPR:$PNg), GPR64:$base),
+ (RegImmInst (REG_SEQUENCE ZPR2Mul2, Ty:$vec0, zsub0, Ty:$vec1, zsub1),
+ PPR:$PNg, GPR64:$base, (i64 0))>;
+}
+
+// Stores of 2 consecutive vectors
+defm : store_pn_x2<nxv16i8, int_aarch64_sve_st1_pn_x2, ST1B_2Z_IMM>;
+defm : store_pn_x2<nxv8i16, int_aarch64_sve_st1_pn_x2, ST1H_2Z_IMM>;
+defm : store_pn_x2<nxv4i32, int_aarch64_sve_st1_pn_x2, ST1W_2Z_IMM>;
+defm : store_pn_x2<nxv2i64, int_aarch64_sve_st1_pn_x2, ST1D_2Z_IMM>;
+defm : store_pn_x2<nxv16i8, int_aarch64_sve_stnt1_pn_x2, STNT1B_2Z_IMM>;
+defm : store_pn_x2<nxv8i16, int_aarch64_sve_stnt1_pn_x2, STNT1H_2Z_IMM>;
+defm : store_pn_x2<nxv4i32, int_aarch64_sve_stnt1_pn_x2, STNT1W_2Z_IMM>;
+defm : store_pn_x2<nxv2i64, int_aarch64_sve_stnt1_pn_x2, STNT1D_2Z_IMM>;
+defm : store_pn_x2<nxv8f16, int_aarch64_sve_st1_pn_x2, ST1H_2Z_IMM>;
+defm : store_pn_x2<nxv8bf16, int_aarch64_sve_st1_pn_x2, ST1H_2Z_IMM>;
+defm : store_pn_x2<nxv4f32, int_aarch64_sve_st1_pn_x2, ST1W_2Z_IMM>;
+defm : store_pn_x2<nxv2f64, int_aarch64_sve_st1_pn_x2, ST1D_2Z_IMM>;
+defm : store_pn_x2<nxv8f16, int_aarch64_sve_stnt1_pn_x2, STNT1H_2Z_IMM>;
+defm : store_pn_x2<nxv8bf16, int_aarch64_sve_stnt1_pn_x2, STNT1H_2Z_IMM>;
+defm : store_pn_x2<nxv4f32, int_aarch64_sve_stnt1_pn_x2, STNT1W_2Z_IMM>;
+defm : store_pn_x2<nxv2f64, int_aarch64_sve_stnt1_pn_x2, STNT1D_2Z_IMM>;
+
+multiclass store_pn_x4<ValueType Ty, SDPatternOperator Store,
+ Instruction RegImmInst> {
+ def : Pat<(Store (Ty ZPR:$vec0), (Ty ZPR:$vec1), (Ty ZPR:$vec2), (Ty ZPR:$vec3),
+ (aarch64svcount PPR:$PNg), GPR64:$base),
+ (RegImmInst (REG_SEQUENCE ZPR4Mul4, Ty:$vec0, zsub0, Ty:$vec1, zsub1,
+ Ty:$vec2, zsub2, Ty:$vec3, zsub3),
+ PPR:$PNg, GPR64:$base, (i64 0))>;
+}
+
+// Stores of 4 consecutive vectors
+defm : store_pn_x4<nxv16i8, int_aarch64_sve_st1_pn_x4, ST1B_4Z_IMM>;
+defm : store_pn_x4<nxv8i16, int_aarch64_sve_st1_pn_x4, ST1H_4Z_IMM>;
+defm : store_pn_x4<nxv4i32, int_aarch64_sve_st1_pn_x4, ST1W_4Z_IMM>;
+defm : store_pn_x4<nxv2i64, int_aarch64_sve_st1_pn_x4, ST1D_4Z_IMM>;
+defm : store_pn_x4<nxv16i8, int_aarch64_sve_stnt1_pn_x4, STNT1B_4Z_IMM>;
+defm : store_pn_x4<nxv8i16, int_aarch64_sve_stnt1_pn_x4, STNT1H_4Z_IMM>;
+defm : store_pn_x4<nxv4i32, int_aarch64_sve_stnt1_pn_x4, STNT1W_4Z_IMM>;
+defm : store_pn_x4<nxv2i64, int_aarch64_sve_stnt1_pn_x4, STNT1D_4Z_IMM>;
+defm : store_pn_x4<nxv8f16, int_aarch64_sve_st1_pn_x4, ST1H_4Z_IMM>;
+defm : store_pn_x4<nxv8bf16, int_aarch64_sve_st1_pn_x4, ST1H_4Z_IMM>;
+defm : store_pn_x4<nxv4f32, int_aarch64_sve_st1_pn_x4, ST1W_4Z_IMM>;
+defm : store_pn_x4<nxv2f64, int_aarch64_sve_st1_pn_x4, ST1D_4Z_IMM>;
+defm : store_pn_x4<nxv8f16, int_aarch64_sve_stnt1_pn_x4, STNT1H_4Z_IMM>;
+defm : store_pn_x4<nxv8bf16, int_aarch64_sve_stnt1_pn_x4, STNT1H_4Z_IMM>;
+defm : store_pn_x4<nxv4f32, int_aarch64_sve_stnt1_pn_x4, STNT1W_4Z_IMM>;
+defm : store_pn_x4<nxv2f64, int_aarch64_sve_stnt1_pn_x4, STNT1D_4Z_IMM>;
+
defm WHILEGE_2PXX : sve2p1_int_while_rr_pair<"whilege", 0b000>;
defm WHILEGT_2PXX : sve2p1_int_while_rr_pair<"whilegt", 0b001>;
defm WHILELT_2PXX : sve2p1_int_while_rr_pair<"whilelt", 0b010>;
diff --git a/llvm/test/CodeGen/AArch64/sve2p1-intrinsics-loads.ll b/llvm/test/CodeGen/AArch64/sve2p1-intrinsics-loads.ll
new file mode 100644
index 000000000000..79d7e6723231
--- /dev/null
+++ b/llvm/test/CodeGen/AArch64/sve2p1-intrinsics-loads.ll
@@ -0,0 +1,648 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc -mtriple=aarch64-linux-gnu -mattr=+sve2p1,+bf16 < %s | FileCheck %s
+; RUN: llc -mtriple=aarch64-linux-gnu -mattr=+sme2,+bf16 < %s | FileCheck %s
+
+; == Normal Multi-Vector Consecutive Loads ==
+
+define { <vscale x 16 x i8>, <vscale x 16 x i8> } @ld1_x2_i8(target("aarch64.svcount") %pn, ptr %ptr) nounwind {
+; CHECK-LABEL: ld1_x2_i8:
+; CHECK: // %bb.0:
+; CHECK-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill
+; CHECK-NEXT: addvl sp, sp, #-1
+; CHECK-NEXT: str p8, [sp, #7, mul vl] // 2-byte Folded Spill
+; CHECK-NEXT: mov p8.b, p0.b
+; CHECK-NEXT: ld1b { z0.b, z1.b }, pn8/z, [x0]
+; CHECK-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload
+; CHECK-NEXT: addvl sp, sp, #1
+; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload
+; CHECK-NEXT: ret
+ %res = call { <vscale x 16 x i8>, <vscale x 16 x i8> } @llvm.aarch64.sve.ld1.pn.x2.nxv16i8(target("aarch64.svcount") %pn, ptr %ptr);
+ ret { <vscale x 16 x i8>, <vscale x 16 x i8> } %res
+}
+
+define { <vscale x 8 x i16>, <vscale x 8 x i16> } @ld1_x2_i16(target("aarch64.svcount") %pn, ptr %ptr) nounwind {
+; CHECK-LABEL: ld1_x2_i16:
+; CHECK: // %bb.0:
+; CHECK-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill
+; CHECK-NEXT: addvl sp, sp, #-1
+; CHECK-NEXT: str p8, [sp, #7, mul vl] // 2-byte Folded Spill
+; CHECK-NEXT: mov p8.b, p0.b
+; CHECK-NEXT: ld1h { z0.h, z1.h }, pn8/z, [x0]
+; CHECK-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload
+; CHECK-NEXT: addvl sp, sp, #1
+; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload
+; CHECK-NEXT: ret
+ %res = call { <vscale x 8 x i16>, <vscale x 8 x i16> } @llvm.aarch64.sve.ld1.pn.x2.nxv8i16(target("aarch64.svcount") %pn, ptr %ptr);
+ ret { <vscale x 8 x i16>, <vscale x 8 x i16> } %res
+}
+
+define { <vscale x 4 x i32>, <vscale x 4 x i32> } @ld1_x2_i32(target("aarch64.svcount") %pn, ptr %ptr) nounwind {
+; CHECK-LABEL: ld1_x2_i32:
+; CHECK: // %bb.0:
+; CHECK-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill
+; CHECK-NEXT: addvl sp, sp, #-1
+; CHECK-NEXT: str p8, [sp, #7, mul vl] // 2-byte Folded Spill
+; CHECK-NEXT: mov p8.b, p0.b
+; CHECK-NEXT: ld1w { z0.s, z1.s }, pn8/z, [x0]
+; CHECK-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload
+; CHECK-NEXT: addvl sp, sp, #1
+; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload
+; CHECK-NEXT: ret
+ %res = call { <vscale x 4 x i32>, <vscale x 4 x i32> } @llvm.aarch64.sve.ld1.pn.x2.nxv4i32(target("aarch64.svcount") %pn, ptr %ptr);
+ ret { <vscale x 4 x i32>, <vscale x 4 x i32> } %res
+}
+
+define { <vscale x 2 x i64>, <vscale x 2 x i64> } @ld1_x2_i64(target("aarch64.svcount") %pn, ptr %ptr) nounwind {
+; CHECK-LABEL: ld1_x2_i64:
+; CHECK: // %bb.0:
+; CHECK-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill
+; CHECK-NEXT: addvl sp, sp, #-1
+; CHECK-NEXT: str p8, [sp, #7, mul vl] // 2-byte Folded Spill
+; CHECK-NEXT: mov p8.b, p0.b
+; CHECK-NEXT: ld1d { z0.d, z1.d }, pn8/z, [x0]
+; CHECK-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload
+; CHECK-NEXT: addvl sp, sp, #1
+; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload
+; CHECK-NEXT: ret
+ %res = call { <vscale x 2 x i64>, <vscale x 2 x i64> } @llvm.aarch64.sve.ld1.pn.x2.nxv2i64(target("aarch64.svcount") %pn, ptr %ptr);
+ ret { <vscale x 2 x i64>, <vscale x 2 x i64> } %res
+}
+
+define { <vscale x 8 x half>, <vscale x 8 x half> } @ld1_x2_f16(target("aarch64.svcount") %pn, ptr %ptr) nounwind {
+; CHECK-LABEL: ld1_x2_f16:
+; CHECK: // %bb.0:
+; CHECK-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill
+; CHECK-NEXT: addvl sp, sp, #-1
+; CHECK-NEXT: str p8, [sp, #7, mul vl] // 2-byte Folded Spill
+; CHECK-NEXT: mov p8.b, p0.b
+; CHECK-NEXT: ld1h { z0.h, z1.h }, pn8/z, [x0]
+; CHECK-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload
+; CHECK-NEXT: addvl sp, sp, #1
+; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload
+; CHECK-NEXT: ret
+ %res = call { <vscale x 8 x half>, <vscale x 8 x half> } @llvm.aarch64.sve.ld1.pn.x2.nxv8f16(target("aarch64.svcount") %pn, ptr %ptr);
+ ret { <vscale x 8 x half>, <vscale x 8 x half> } %res
+}
+
+define { <vscale x 8 x bfloat>, <vscale x 8 x bfloat> } @ld1_x2_bf16(target("aarch64.svcount") %pn, ptr %ptr) nounwind {
+; CHECK-LABEL: ld1_x2_bf16:
+; CHECK: // %bb.0:
+; CHECK-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill
+; CHECK-NEXT: addvl sp, sp, #-1
+; CHECK-NEXT: str p8, [sp, #7, mul vl] // 2-byte Folded Spill
+; CHECK-NEXT: mov p8.b, p0.b
+; CHECK-NEXT: ld1h { z0.h, z1.h }, pn8/z, [x0]
+; CHECK-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload
+; CHECK-NEXT: addvl sp, sp, #1
+; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload
+; CHECK-NEXT: ret
+ %res = call { <vscale x 8 x bfloat>, <vscale x 8 x bfloat> } @llvm.aarch64.sve.ld1.pn.x2.nxv8bf16(target("aarch64.svcount") %pn, ptr %ptr);
+ ret { <vscale x 8 x bfloat>, <vscale x 8 x bfloat> } %res
+}
+
+define { <vscale x 4 x float>, <vscale x 4 x float> } @ld1_x2_f32(target("aarch64.svcount") %pn, ptr %ptr) nounwind {
+; CHECK-LABEL: ld1_x2_f32:
+; CHECK: // %bb.0:
+; CHECK-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill
+; CHECK-NEXT: addvl sp, sp, #-1
+; CHECK-NEXT: str p8, [sp, #7, mul vl] // 2-byte Folded Spill
+; CHECK-NEXT: mov p8.b, p0.b
+; CHECK-NEXT: ld1w { z0.s, z1.s }, pn8/z, [x0]
+; CHECK-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload
+; CHECK-NEXT: addvl sp, sp, #1
+; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload
+; CHECK-NEXT: ret
+ %res = call { <vscale x 4 x float>, <vscale x 4 x float> } @llvm.aarch64.sve.ld1.pn.x2.nxv4f32(target("aarch64.svcount") %pn, ptr %ptr);
+ ret { <vscale x 4 x float>, <vscale x 4 x float> } %res
+}
+
+define { <vscale x 2 x double>, <vscale x 2 x double> } @ld1_x2_f64(target("aarch64.svcount") %pn, ptr %ptr) nounwind {
+; CHECK-LABEL: ld1_x2_f64:
+; CHECK: // %bb.0:
+; CHECK-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill
+; CHECK-NEXT: addvl sp, sp, #-1
+; CHECK-NEXT: str p8, [sp, #7, mul vl] // 2-byte Folded Spill
+; CHECK-NEXT: mov p8.b, p0.b
+; CHECK-NEXT: ld1d { z0.d, z1.d }, pn8/z, [x0]
+; CHECK-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload
+; CHECK-NEXT: addvl sp, sp, #1
+; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload
+; CHECK-NEXT: ret
+ %res = call { <vscale x 2 x double>, <vscale x 2 x double> } @llvm.aarch64.sve.ld1.pn.x2.nxv2f64(target("aarch64.svcount") %pn, ptr %ptr);
+ ret { <vscale x 2 x double>, <vscale x 2 x double> } %res
+}
+
+; Test to ensure we load into the correct registers for the instruction
+define <vscale x 16 x i8> @ld1_x2_i8_z0_taken(target("aarch64.svcount") %pn, ptr %ptr, <vscale x 16 x i8> %val) {
+; CHECK-LABEL: ld1_x2_i8_z0_taken:
+; CHECK: // %bb.0:
+; CHECK-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill
+; CHECK-NEXT: .cfi_def_cfa_offset 16
+; CHECK-NEXT: .cfi_offset w29, -16
+; CHECK-NEXT: addvl sp, sp, #-1
+; CHECK-NEXT: .cfi_escape 0x0f, 0x0c, 0x8f, 0x00, 0x11, 0x10, 0x22, 0x11, 0x08, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 16 + 8 * VG
+; CHECK-NEXT: str p8, [sp, #7, mul vl] // 2-byte Folded Spill
+; CHECK-NEXT: mov p8.b, p0.b
+; CHECK-NEXT: ld1b { z2.b, z3.b }, pn8/z, [x0]
+; CHECK-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload
+; CHECK-NEXT: add z0.b, z0.b, z2.b
+; CHECK-NEXT: addvl sp, sp, #1
+; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload
+; CHECK-NEXT: ret
+ %ld1 = call { <vscale x 16 x i8>, <vscale x 16 x i8> } @llvm.aarch64.sve.ld1.pn.x2.nxv16i8(target("aarch64.svcount") %pn, ptr %ptr);
+ %ld1_0 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8> } %ld1, 0
+ %res = add <vscale x 16 x i8> %val, %ld1_0
+ ret <vscale x 16 x i8> %res
+}
+
+define { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } @ld1_x4_i8(target("aarch64.svcount") %pn, ptr %ptr) nounwind {
+; CHECK-LABEL: ld1_x4_i8:
+; CHECK: // %bb.0:
+; CHECK-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill
+; CHECK-NEXT: addvl sp, sp, #-1
+; CHECK-NEXT: str p8, [sp, #7, mul vl] // 2-byte Folded Spill
+; CHECK-NEXT: mov p8.b, p0.b
+; CHECK-NEXT: ld1b { z0.b - z3.b }, pn8/z, [x0]
+; CHECK-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload
+; CHECK-NEXT: addvl sp, sp, #1
+; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload
+; CHECK-NEXT: ret
+ %res = call { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } @llvm.aarch64.sve.ld1.pn.x4.nxv16i8(target("aarch64.svcount") %pn, ptr %ptr);
+ ret { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %res
+}
+
+define { <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16> } @ld1_x4_i16(target("aarch64.svcount") %pn, ptr %ptr) nounwind {
+; CHECK-LABEL: ld1_x4_i16:
+; CHECK: // %bb.0:
+; CHECK-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill
+; CHECK-NEXT: addvl sp, sp, #-1
+; CHECK-NEXT: str p8, [sp, #7, mul vl] // 2-byte Folded Spill
+; CHECK-NEXT: mov p8.b, p0.b
+; CHECK-NEXT: ld1h { z0.h - z3.h }, pn8/z, [x0]
+; CHECK-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload
+; CHECK-NEXT: addvl sp, sp, #1
+; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload
+; CHECK-NEXT: ret
+ %res = call { <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16> } @llvm.aarch64.sve.ld1.pn.x4.nxv8i16(target("aarch64.svcount") %pn, ptr %ptr);
+ ret { <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16> } %res
+}
+
+define { <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32> } @ld1_x4_i32(target("aarch64.svcount") %pn, ptr %ptr) nounwind {
+; CHECK-LABEL: ld1_x4_i32:
+; CHECK: // %bb.0:
+; CHECK-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill
+; CHECK-NEXT: addvl sp, sp, #-1
+; CHECK-NEXT: str p8, [sp, #7, mul vl] // 2-byte Folded Spill
+; CHECK-NEXT: mov p8.b, p0.b
+; CHECK-NEXT: ld1w { z0.s - z3.s }, pn8/z, [x0]
+; CHECK-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload
+; CHECK-NEXT: addvl sp, sp, #1
+; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload
+; CHECK-NEXT: ret
+ %res = call { <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32> } @llvm.aarch64.sve.ld1.pn.x4.nxv4i32(target("aarch64.svcount") %pn, ptr %ptr);
+ ret { <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32> } %res
+}
+
+define { <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64> } @ld1_x4_i64(target("aarch64.svcount") %pn, ptr %ptr) nounwind {
+; CHECK-LABEL: ld1_x4_i64:
+; CHECK: // %bb.0:
+; CHECK-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill
+; CHECK-NEXT: addvl sp, sp, #-1
+; CHECK-NEXT: str p8, [sp, #7, mul vl] // 2-byte Folded Spill
+; CHECK-NEXT: mov p8.b, p0.b
+; CHECK-NEXT: ld1d { z0.d - z3.d }, pn8/z, [x0]
+; CHECK-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload
+; CHECK-NEXT: addvl sp, sp, #1
+; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload
+; CHECK-NEXT: ret
+ %res = call { <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64> } @llvm.aarch64.sve.ld1.pn.x4.nxv2i64(target("aarch64.svcount") %pn, ptr %ptr);
+ ret { <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64> } %res
+}
+
+define { <vscale x 8 x half>, <vscale x 8 x half>, <vscale x 8 x half>, <vscale x 8 x half> } @ld1_x4_f16(target("aarch64.svcount") %pn, ptr %ptr) nounwind {
+; CHECK-LABEL: ld1_x4_f16:
+; CHECK: // %bb.0:
+; CHECK-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill
+; CHECK-NEXT: addvl sp, sp, #-1
+; CHECK-NEXT: str p8, [sp, #7, mul vl] // 2-byte Folded Spill
+; CHECK-NEXT: mov p8.b, p0.b
+; CHECK-NEXT: ld1h { z0.h - z3.h }, pn8/z, [x0]
+; CHECK-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload
+; CHECK-NEXT: addvl sp, sp, #1
+; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload
+; CHECK-NEXT: ret
+ %res = call { <vscale x 8 x half>, <vscale x 8 x half>, <vscale x 8 x half>, <vscale x 8 x half> } @llvm.aarch64.sve.ld1.pn.x4.nxv8f16(target("aarch64.svcount") %pn, ptr %ptr);
+ ret { <vscale x 8 x half>, <vscale x 8 x half>, <vscale x 8 x half>, <vscale x 8 x half> } %res
+}
+
+define { <vscale x 8 x bfloat>, <vscale x 8 x bfloat>, <vscale x 8 x bfloat>, <vscale x 8 x bfloat> } @ld1_x4_bf16(target("aarch64.svcount") %pn, ptr %ptr) nounwind {
+; CHECK-LABEL: ld1_x4_bf16:
+; CHECK: // %bb.0:
+; CHECK-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill
+; CHECK-NEXT: addvl sp, sp, #-1
+; CHECK-NEXT: str p8, [sp, #7, mul vl] // 2-byte Folded Spill
+; CHECK-NEXT: mov p8.b, p0.b
+; CHECK-NEXT: ld1h { z0.h - z3.h }, pn8/z, [x0]
+; CHECK-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload
+; CHECK-NEXT: addvl sp, sp, #1
+; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload
+; CHECK-NEXT: ret
+ %res = call { <vscale x 8 x bfloat>, <vscale x 8 x bfloat>, <vscale x 8 x bfloat>, <vscale x 8 x bfloat> } @llvm.aarch64.sve.ld1.pn.x4.nxv8bf16(target("aarch64.svcount") %pn, ptr %ptr);
+ ret { <vscale x 8 x bfloat>, <vscale x 8 x bfloat>, <vscale x 8 x bfloat>, <vscale x 8 x bfloat> } %res
+}
+
+define { <vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float> } @ld1_x4_f32(target("aarch64.svcount") %pn, ptr %ptr) nounwind {
+; CHECK-LABEL: ld1_x4_f32:
+; CHECK: // %bb.0:
+; CHECK-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill
+; CHECK-NEXT: addvl sp, sp, #-1
+; CHECK-NEXT: str p8, [sp, #7, mul vl] // 2-byte Folded Spill
+; CHECK-NEXT: mov p8.b, p0.b
+; CHECK-NEXT: ld1w { z0.s - z3.s }, pn8/z, [x0]
+; CHECK-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload
+; CHECK-NEXT: addvl sp, sp, #1
+; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload
+; CHECK-NEXT: ret
+ %res = call { <vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float> } @llvm.aarch64.sve.ld1.pn.x4.nxv4f32(target("aarch64.svcount") %pn, ptr %ptr);
+ ret { <vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float> } %res
+}
+
+define { <vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x double> } @ld1_x4_f64(target("aarch64.svcount") %pn, ptr %ptr) nounwind {
+; CHECK-LABEL: ld1_x4_f64:
+; CHECK: // %bb.0:
+; CHECK-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill
+; CHECK-NEXT: addvl sp, sp, #-1
+; CHECK-NEXT: str p8, [sp, #7, mul vl] // 2-byte Folded Spill
+; CHECK-NEXT: mov p8.b, p0.b
+; CHECK-NEXT: ld1d { z0.d - z3.d }, pn8/z, [x0]
+; CHECK-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload
+; CHECK-NEXT: addvl sp, sp, #1
+; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload
+; CHECK-NEXT: ret
+ %res = call { <vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x double> } @llvm.aarch64.sve.ld1.pn.x4.nxv2f64(target("aarch64.svcount") %pn, ptr %ptr);
+ ret { <vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x double> } %res
+}
+
+; Test to ensure we load into the correct registers for the instruction
+define <vscale x 8 x i16> @ld1_x4_i16_z0_taken(target("aarch64.svcount") %pn, ptr %ptr, <vscale x 8 x i16> %val) {
+; CHECK-LABEL: ld1_x4_i16_z0_taken:
+; CHECK: // %bb.0:
+; CHECK-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill
+; CHECK-NEXT: .cfi_def_cfa_offset 16
+; CHECK-NEXT: .cfi_offset w29, -16
+; CHECK-NEXT: addvl sp, sp, #-1
+; CHECK-NEXT: .cfi_escape 0x0f, 0x0c, 0x8f, 0x00, 0x11, 0x10, 0x22, 0x11, 0x08, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 16 + 8 * VG
+; CHECK-NEXT: str p8, [sp, #7, mul vl] // 2-byte Folded Spill
+; CHECK-NEXT: mov p8.b, p0.b
+; CHECK-NEXT: ld1h { z4.h - z7.h }, pn8/z, [x0]
+; CHECK-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload
+; CHECK-NEXT: add z0.h, z0.h, z4.h
+; CHECK-NEXT: addvl sp, sp, #1
+; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload
+; CHECK-NEXT: ret
+ %ld1 = call { <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16> } @llvm.aarch64.sve.ld1.pn.x4.nxv8i16(target("aarch64.svcount") %pn, ptr %ptr);
+ %ld1_0 = extractvalue { <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16> } %ld1, 0
+ %res = add <vscale x 8 x i16> %val, %ld1_0
+ ret <vscale x 8 x i16> %res
+}
+
+
+; == Non-temporal Multi-Vector Consecutive Loads ==
+
+define { <vscale x 16 x i8>, <vscale x 16 x i8> } @ldnt1_x2_i8(target("aarch64.svcount") %pn, ptr %ptr) nounwind {
+; CHECK-LABEL: ldnt1_x2_i8:
+; CHECK: // %bb.0:
+; CHECK-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill
+; CHECK-NEXT: addvl sp, sp, #-1
+; CHECK-NEXT: str p8, [sp, #7, mul vl] // 2-byte Folded Spill
+; CHECK-NEXT: mov p8.b, p0.b
+; CHECK-NEXT: ldnt1b { z0.b, z1.b }, pn8/z, [x0]
+; CHECK-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload
+; CHECK-NEXT: addvl sp, sp, #1
+; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload
+; CHECK-NEXT: ret
+ %res = call { <vscale x 16 x i8>, <vscale x 16 x i8> } @llvm.aarch64.sve.ldnt1.pn.x2.nxv16i8(target("aarch64.svcount") %pn, ptr %ptr);
+ ret { <vscale x 16 x i8>, <vscale x 16 x i8> } %res
+}
+
+define { <vscale x 8 x i16>, <vscale x 8 x i16> } @ldnt1_x2_i16(target("aarch64.svcount") %pn, ptr %ptr) nounwind {
+; CHECK-LABEL: ldnt1_x2_i16:
+; CHECK: // %bb.0:
+; CHECK-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill
+; CHECK-NEXT: addvl sp, sp, #-1
+; CHECK-NEXT: str p8, [sp, #7, mul vl] // 2-byte Folded Spill
+; CHECK-NEXT: mov p8.b, p0.b
+; CHECK-NEXT: ldnt1h { z0.h, z1.h }, pn8/z, [x0]
+; CHECK-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload
+; CHECK-NEXT: addvl sp, sp, #1
+; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload
+; CHECK-NEXT: ret
+ %res = call { <vscale x 8 x i16>, <vscale x 8 x i16> } @llvm.aarch64.sve.ldnt1.pn.x2.nxv8i16(target("aarch64.svcount") %pn, ptr %ptr);
+ ret { <vscale x 8 x i16>, <vscale x 8 x i16> } %res
+}
+
+define { <vscale x 4 x i32>, <vscale x 4 x i32> } @ldnt1_x2_i32(target("aarch64.svcount") %pn, ptr %ptr) nounwind {
+; CHECK-LABEL: ldnt1_x2_i32:
+; CHECK: // %bb.0:
+; CHECK-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill
+; CHECK-NEXT: addvl sp, sp, #-1
+; CHECK-NEXT: str p8, [sp, #7, mul vl] // 2-byte Folded Spill
+; CHECK-NEXT: mov p8.b, p0.b
+; CHECK-NEXT: ldnt1w { z0.s, z1.s }, pn8/z, [x0]
+; CHECK-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload
+; CHECK-NEXT: addvl sp, sp, #1
+; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload
+; CHECK-NEXT: ret
+ %res = call { <vscale x 4 x i32>, <vscale x 4 x i32> } @llvm.aarch64.sve.ldnt1.pn.x2.nxv4i32(target("aarch64.svcount") %pn, ptr %ptr);
+ ret { <vscale x 4 x i32>, <vscale x 4 x i32> } %res
+}
+
+define { <vscale x 2 x i64>, <vscale x 2 x i64> } @ldnt1_x2_i64(target("aarch64.svcount") %pn, ptr %ptr) nounwind {
+; CHECK-LABEL: ldnt1_x2_i64:
+; CHECK: // %bb.0:
+; CHECK-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill
+; CHECK-NEXT: addvl sp, sp, #-1
+; CHECK-NEXT: str p8, [sp, #7, mul vl] // 2-byte Folded Spill
+; CHECK-NEXT: mov p8.b, p0.b
+; CHECK-NEXT: ldnt1d { z0.d, z1.d }, pn8/z, [x0]
+; CHECK-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload
+; CHECK-NEXT: addvl sp, sp, #1
+; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload
+; CHECK-NEXT: ret
+ %res = call { <vscale x 2 x i64>, <vscale x 2 x i64> } @llvm.aarch64.sve.ldnt1.pn.x2.nxv2i64(target("aarch64.svcount") %pn, ptr %ptr);
+ ret { <vscale x 2 x i64>, <vscale x 2 x i64> } %res
+}
+
+define { <vscale x 8 x half>, <vscale x 8 x half> } @ldnt1_x2_f16(target("aarch64.svcount") %pn, ptr %ptr) nounwind {
+; CHECK-LABEL: ldnt1_x2_f16:
+; CHECK: // %bb.0:
+; CHECK-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill
+; CHECK-NEXT: addvl sp, sp, #-1
+; CHECK-NEXT: str p8, [sp, #7, mul vl] // 2-byte Folded Spill
+; CHECK-NEXT: mov p8.b, p0.b
+; CHECK-NEXT: ldnt1h { z0.h, z1.h }, pn8/z, [x0]
+; CHECK-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload
+; CHECK-NEXT: addvl sp, sp, #1
+; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload
+; CHECK-NEXT: ret
+ %res = call { <vscale x 8 x half>, <vscale x 8 x half> } @llvm.aarch64.sve.ldnt1.pn.x2.nxv8f16(target("aarch64.svcount") %pn, ptr %ptr);
+ ret { <vscale x 8 x half>, <vscale x 8 x half> } %res
+}
+
+define { <vscale x 8 x bfloat>, <vscale x 8 x bfloat> } @ldnt1_x2_bf16(target("aarch64.svcount") %pn, ptr %ptr) nounwind {
+; CHECK-LABEL: ldnt1_x2_bf16:
+; CHECK: // %bb.0:
+; CHECK-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill
+; CHECK-NEXT: addvl sp, sp, #-1
+; CHECK-NEXT: str p8, [sp, #7, mul vl] // 2-byte Folded Spill
+; CHECK-NEXT: mov p8.b, p0.b
+; CHECK-NEXT: ldnt1h { z0.h, z1.h }, pn8/z, [x0]
+; CHECK-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload
+; CHECK-NEXT: addvl sp, sp, #1
+; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload
+; CHECK-NEXT: ret
+ %res = call { <vscale x 8 x bfloat>, <vscale x 8 x bfloat> } @llvm.aarch64.sve.ldnt1.pn.x2.nxv8bf16(target("aarch64.svcount") %pn, ptr %ptr);
+ ret { <vscale x 8 x bfloat>, <vscale x 8 x bfloat> } %res
+}
+
+define { <vscale x 4 x float>, <vscale x 4 x float> } @ldnt1_x2_f32(target("aarch64.svcount") %pn, ptr %ptr) nounwind {
+; CHECK-LABEL: ldnt1_x2_f32:
+; CHECK: // %bb.0:
+; CHECK-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill
+; CHECK-NEXT: addvl sp, sp, #-1
+; CHECK-NEXT: str p8, [sp, #7, mul vl] // 2-byte Folded Spill
+; CHECK-NEXT: mov p8.b, p0.b
+; CHECK-NEXT: ldnt1w { z0.s, z1.s }, pn8/z, [x0]
+; CHECK-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload
+; CHECK-NEXT: addvl sp, sp, #1
+; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload
+; CHECK-NEXT: ret
+ %res = call { <vscale x 4 x float>, <vscale x 4 x float> } @llvm.aarch64.sve.ldnt1.pn.x2.nxv4f32(target("aarch64.svcount") %pn, ptr %ptr);
+ ret { <vscale x 4 x float>, <vscale x 4 x float> } %res
+}
+
+define { <vscale x 2 x double>, <vscale x 2 x double> } @ldnt1_x2_f64(target("aarch64.svcount") %pn, ptr %ptr) nounwind {
+; CHECK-LABEL: ldnt1_x2_f64:
+; CHECK: // %bb.0:
+; CHECK-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill
+; CHECK-NEXT: addvl sp, sp, #-1
+; CHECK-NEXT: str p8, [sp, #7, mul vl] // 2-byte Folded Spill
+; CHECK-NEXT: mov p8.b, p0.b
+; CHECK-NEXT: ldnt1d { z0.d, z1.d }, pn8/z, [x0]
+; CHECK-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload
+; CHECK-NEXT: addvl sp, sp, #1
+; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload
+; CHECK-NEXT: ret
+ %res = call { <vscale x 2 x double>, <vscale x 2 x double> } @llvm.aarch64.sve.ldnt1.pn.x2.nxv2f64(target("aarch64.svcount") %pn, ptr %ptr);
+ ret { <vscale x 2 x double>, <vscale x 2 x double> } %res
+}
+
+; Test to ensure we load into the correct registers for the instruction
+define <vscale x 4 x i32> @ldnt1_x2_i32_z0_taken(target("aarch64.svcount") %pn, ptr %ptr, <vscale x 4 x i32> %val) {
+; CHECK-LABEL: ldnt1_x2_i32_z0_taken:
+; CHECK: // %bb.0:
+; CHECK-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill
+; CHECK-NEXT: .cfi_def_cfa_offset 16
+; CHECK-NEXT: .cfi_offset w29, -16
+; CHECK-NEXT: addvl sp, sp, #-1
+; CHECK-NEXT: .cfi_escape 0x0f, 0x0c, 0x8f, 0x00, 0x11, 0x10, 0x22, 0x11, 0x08, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 16 + 8 * VG
+; CHECK-NEXT: str p8, [sp, #7, mul vl] // 2-byte Folded Spill
+; CHECK-NEXT: mov p8.b, p0.b
+; CHECK-NEXT: ldnt1w { z2.s, z3.s }, pn8/z, [x0]
+; CHECK-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload
+; CHECK-NEXT: add z0.s, z0.s, z2.s
+; CHECK-NEXT: addvl sp, sp, #1
+; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload
+; CHECK-NEXT: ret
+ %ld1 = call { <vscale x 4 x i32>, <vscale x 4 x i32> } @llvm.aarch64.sve.ldnt1.pn.x2.nxv4i32(target("aarch64.svcount") %pn, ptr %ptr);
+ %ld1_0 = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32> } %ld1, 0
+ %res = add <vscale x 4 x i32> %val, %ld1_0
+ ret <vscale x 4 x i32> %res
+}
+
+define { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } @ldnt1_x4_i8(target("aarch64.svcount") %pn, ptr %ptr) nounwind {
+; CHECK-LABEL: ldnt1_x4_i8:
+; CHECK: // %bb.0:
+; CHECK-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill
+; CHECK-NEXT: addvl sp, sp, #-1
+; CHECK-NEXT: str p8, [sp, #7, mul vl] // 2-byte Folded Spill
+; CHECK-NEXT: mov p8.b, p0.b
+; CHECK-NEXT: ldnt1b { z0.b - z3.b }, pn8/z, [x0]
+; CHECK-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload
+; CHECK-NEXT: addvl sp, sp, #1
+; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload
+; CHECK-NEXT: ret
+ %res = call { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } @llvm.aarch64.sve.ldnt1.pn.x4.nxv16i8(target("aarch64.svcount") %pn, ptr %ptr);
+ ret { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %res
+}
+
+define { <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16> } @ldnt1_x4_i16(target("aarch64.svcount") %pn, ptr %ptr) nounwind {
+; CHECK-LABEL: ldnt1_x4_i16:
+; CHECK: // %bb.0:
+; CHECK-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill
+; CHECK-NEXT: addvl sp, sp, #-1
+; CHECK-NEXT: str p8, [sp, #7, mul vl] // 2-byte Folded Spill
+; CHECK-NEXT: mov p8.b, p0.b
+; CHECK-NEXT: ldnt1h { z0.h - z3.h }, pn8/z, [x0]
+; CHECK-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload
+; CHECK-NEXT: addvl sp, sp, #1
+; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload
+; CHECK-NEXT: ret
+ %res = call { <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16> } @llvm.aarch64.sve.ldnt1.pn.x4.nxv8i16(target("aarch64.svcount") %pn, ptr %ptr);
+ ret { <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16> } %res
+}
+
+define { <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32> } @ldnt1_x4_i32(target("aarch64.svcount") %pn, ptr %ptr) nounwind {
+; CHECK-LABEL: ldnt1_x4_i32:
+; CHECK: // %bb.0:
+; CHECK-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill
+; CHECK-NEXT: addvl sp, sp, #-1
+; CHECK-NEXT: str p8, [sp, #7, mul vl] // 2-byte Folded Spill
+; CHECK-NEXT: mov p8.b, p0.b
+; CHECK-NEXT: ldnt1w { z0.s - z3.s }, pn8/z, [x0]
+; CHECK-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload
+; CHECK-NEXT: addvl sp, sp, #1
+; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload
+; CHECK-NEXT: ret
+ %res = call { <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32> } @llvm.aarch64.sve.ldnt1.pn.x4.nxv4i32(target("aarch64.svcount") %pn, ptr %ptr);
+ ret { <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32> } %res
+}
+
+define { <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64> } @ldnt1_x4_i64(target("aarch64.svcount") %pn, ptr %ptr) nounwind {
+; CHECK-LABEL: ldnt1_x4_i64:
+; CHECK: // %bb.0:
+; CHECK-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill
+; CHECK-NEXT: addvl sp, sp, #-1
+; CHECK-NEXT: str p8, [sp, #7, mul vl] // 2-byte Folded Spill
+; CHECK-NEXT: mov p8.b, p0.b
+; CHECK-NEXT: ldnt1d { z0.d - z3.d }, pn8/z, [x0]
+; CHECK-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload
+; CHECK-NEXT: addvl sp, sp, #1
+; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload
+; CHECK-NEXT: ret
+ %res = call { <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64> } @llvm.aarch64.sve.ldnt1.pn.x4.nxv2i64(target("aarch64.svcount") %pn, ptr %ptr);
+ ret { <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64> } %res
+}
+
+define { <vscale x 8 x half>, <vscale x 8 x half>, <vscale x 8 x half>, <vscale x 8 x half> } @ldnt1_x4_f16(target("aarch64.svcount") %pn, ptr %ptr) nounwind {
+; CHECK-LABEL: ldnt1_x4_f16:
+; CHECK: // %bb.0:
+; CHECK-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill
+; CHECK-NEXT: addvl sp, sp, #-1
+; CHECK-NEXT: str p8, [sp, #7, mul vl] // 2-byte Folded Spill
+; CHECK-NEXT: mov p8.b, p0.b
+; CHECK-NEXT: ldnt1h { z0.h - z3.h }, pn8/z, [x0]
+; CHECK-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload
+; CHECK-NEXT: addvl sp, sp, #1
+; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload
+; CHECK-NEXT: ret
+ %res = call { <vscale x 8 x half>, <vscale x 8 x half>, <vscale x 8 x half>, <vscale x 8 x half> } @llvm.aarch64.sve.ldnt1.pn.x4.nxv8f16(target("aarch64.svcount") %pn, ptr %ptr);
+ ret { <vscale x 8 x half>, <vscale x 8 x half>, <vscale x 8 x half>, <vscale x 8 x half> } %res
+}
+
+define { <vscale x 8 x bfloat>, <vscale x 8 x bfloat>, <vscale x 8 x bfloat>, <vscale x 8 x bfloat> } @ldnt1_x4_bf16(target("aarch64.svcount") %pn, ptr %ptr) nounwind {
+; CHECK-LABEL: ldnt1_x4_bf16:
+; CHECK: // %bb.0:
+; CHECK-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill
+; CHECK-NEXT: addvl sp, sp, #-1
+; CHECK-NEXT: str p8, [sp, #7, mul vl] // 2-byte Folded Spill
+; CHECK-NEXT: mov p8.b, p0.b
+; CHECK-NEXT: ldnt1h { z0.h - z3.h }, pn8/z, [x0]
+; CHECK-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload
+; CHECK-NEXT: addvl sp, sp, #1
+; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload
+; CHECK-NEXT: ret
+ %res = call { <vscale x 8 x bfloat>, <vscale x 8 x bfloat>, <vscale x 8 x bfloat>, <vscale x 8 x bfloat> } @llvm.aarch64.sve.ldnt1.pn.x4.nxv8bf16(target("aarch64.svcount") %pn, ptr %ptr);
+ ret { <vscale x 8 x bfloat>, <vscale x 8 x bfloat>, <vscale x 8 x bfloat>, <vscale x 8 x bfloat> } %res
+}
+
+define { <vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float> } @ldnt1_x4_f32(target("aarch64.svcount") %pn, ptr %ptr) nounwind {
+; CHECK-LABEL: ldnt1_x4_f32:
+; CHECK: // %bb.0:
+; CHECK-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill
+; CHECK-NEXT: addvl sp, sp, #-1
+; CHECK-NEXT: str p8, [sp, #7, mul vl] // 2-byte Folded Spill
+; CHECK-NEXT: mov p8.b, p0.b
+; CHECK-NEXT: ldnt1w { z0.s - z3.s }, pn8/z, [x0]
+; CHECK-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload
+; CHECK-NEXT: addvl sp, sp, #1
+; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload
+; CHECK-NEXT: ret
+ %res = call { <vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float> } @llvm.aarch64.sve.ldnt1.pn.x4.nxv4f32(target("aarch64.svcount") %pn, ptr %ptr);
+ ret { <vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float> } %res
+}
+
+define { <vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x double> } @ldnt1_x4_f64(target("aarch64.svcount") %pn, ptr %ptr) nounwind {
+; CHECK-LABEL: ldnt1_x4_f64:
+; CHECK: // %bb.0:
+; CHECK-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill
+; CHECK-NEXT: addvl sp, sp, #-1
+; CHECK-NEXT: str p8, [sp, #7, mul vl] // 2-byte Folded Spill
+; CHECK-NEXT: mov p8.b, p0.b
+; CHECK-NEXT: ldnt1d { z0.d - z3.d }, pn8/z, [x0]
+; CHECK-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload
+; CHECK-NEXT: addvl sp, sp, #1
+; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload
+; CHECK-NEXT: ret
+ %res = call { <vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x double> } @llvm.aarch64.sve.ldnt1.pn.x4.nxv2f64(target("aarch64.svcount") %pn, ptr %ptr);
+ ret { <vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x double> } %res
+}
+
+; Test to ensure we load into the correct registers for the instruction
+define <vscale x 2 x i64> @ldnt1_x4_i64_z0_taken(target("aarch64.svcount") %pn, ptr %ptr, <vscale x 2 x i64> %val) {
+; CHECK-LABEL: ldnt1_x4_i64_z0_taken:
+; CHECK: // %bb.0:
+; CHECK-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill
+; CHECK-NEXT: .cfi_def_cfa_offset 16
+; CHECK-NEXT: .cfi_offset w29, -16
+; CHECK-NEXT: addvl sp, sp, #-1
+; CHECK-NEXT: .cfi_escape 0x0f, 0x0c, 0x8f, 0x00, 0x11, 0x10, 0x22, 0x11, 0x08, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 16 + 8 * VG
+; CHECK-NEXT: str p8, [sp, #7, mul vl] // 2-byte Folded Spill
+; CHECK-NEXT: mov p8.b, p0.b
+; CHECK-NEXT: ldnt1d { z4.d - z7.d }, pn8/z, [x0]
+; CHECK-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload
+; CHECK-NEXT: add z0.d, z0.d, z4.d
+; CHECK-NEXT: addvl sp, sp, #1
+; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload
+; CHECK-NEXT: ret
+ %ld1 = call { <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64> } @llvm.aarch64.sve.ldnt1.pn.x4.nxv2i64(target("aarch64.svcount") %pn, ptr %ptr);
+ %ld1_0 = extractvalue { <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64> } %ld1, 0
+ %res = add <vscale x 2 x i64> %val, %ld1_0
+ ret <vscale x 2 x i64> %res
+}
+
+declare { <vscale x 2 x i64>, <vscale x 2 x i64> } @llvm.aarch64.sve.ld1.pn.x2.nxv2i64(target("aarch64.svcount"), ptr)
+declare { <vscale x 4 x i32>, <vscale x 4 x i32> } @llvm.aarch64.sve.ld1.pn.x2.nxv4i32(target("aarch64.svcount"), ptr)
+declare { <vscale x 8 x i16>, <vscale x 8 x i16> } @llvm.aarch64.sve.ld1.pn.x2.nxv8i16(target("aarch64.svcount"), ptr)
+declare { <vscale x 16 x i8>, <vscale x 16 x i8> } @llvm.aarch64.sve.ld1.pn.x2.nxv16i8(target("aarch64.svcount"), ptr)
+declare { <vscale x 2 x double>, <vscale x 2 x double> } @llvm.aarch64.sve.ld1.pn.x2.nxv2f64(target("aarch64.svcount"), ptr)
+declare { <vscale x 4 x float>, <vscale x 4 x float> } @llvm.aarch64.sve.ld1.pn.x2.nxv4f32(target("aarch64.svcount"), ptr)
+declare { <vscale x 8 x half>, <vscale x 8 x half> } @llvm.aarch64.sve.ld1.pn.x2.nxv8f16(target("aarch64.svcount"), ptr)
+declare { <vscale x 8 x bfloat>, <vscale x 8 x bfloat> } @llvm.aarch64.sve.ld1.pn.x2.nxv8bf16(target("aarch64.svcount"), ptr)
+
+declare { <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64> } @llvm.aarch64.sve.ld1.pn.x4.nxv2i64(target("aarch64.svcount"), ptr)
+declare { <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32> } @llvm.aarch64.sve.ld1.pn.x4.nxv4i32(target("aarch64.svcount"), ptr)
+declare { <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16> } @llvm.aarch64.sve.ld1.pn.x4.nxv8i16(target("aarch64.svcount"), ptr)
+declare { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } @llvm.aarch64.sve.ld1.pn.x4.nxv16i8(target("aarch64.svcount"), ptr)
+declare { <vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x double> } @llvm.aarch64.sve.ld1.pn.x4.nxv2f64(target("aarch64.svcount"), ptr)
+declare { <vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float> } @llvm.aarch64.sve.ld1.pn.x4.nxv4f32(target("aarch64.svcount"), ptr)
+declare { <vscale x 8 x half>, <vscale x 8 x half>, <vscale x 8 x half>, <vscale x 8 x half> } @llvm.aarch64.sve.ld1.pn.x4.nxv8f16(target("aarch64.svcount"), ptr)
+declare { <vscale x 8 x bfloat>, <vscale x 8 x bfloat>, <vscale x 8 x bfloat>, <vscale x 8 x bfloat> } @llvm.aarch64.sve.ld1.pn.x4.nxv8bf16(target("aarch64.svcount"), ptr)
+
+declare { <vscale x 2 x i64>, <vscale x 2 x i64> } @llvm.aarch64.sve.ldnt1.pn.x2.nxv2i64(target("aarch64.svcount"), ptr)
+declare { <vscale x 4 x i32>, <vscale x 4 x i32> } @llvm.aarch64.sve.ldnt1.pn.x2.nxv4i32(target("aarch64.svcount"), ptr)
+declare { <vscale x 8 x i16>, <vscale x 8 x i16> } @llvm.aarch64.sve.ldnt1.pn.x2.nxv8i16(target("aarch64.svcount"), ptr)
+declare { <vscale x 16 x i8>, <vscale x 16 x i8> } @llvm.aarch64.sve.ldnt1.pn.x2.nxv16i8(target("aarch64.svcount"), ptr)
+declare { <vscale x 2 x double>, <vscale x 2 x double> } @llvm.aarch64.sve.ldnt1.pn.x2.nxv2f64(target("aarch64.svcount"), ptr)
+declare { <vscale x 4 x float>, <vscale x 4 x float> } @llvm.aarch64.sve.ldnt1.pn.x2.nxv4f32(target("aarch64.svcount"), ptr)
+declare { <vscale x 8 x half>, <vscale x 8 x half> } @llvm.aarch64.sve.ldnt1.pn.x2.nxv8f16(target("aarch64.svcount"), ptr)
+declare { <vscale x 8 x bfloat>, <vscale x 8 x bfloat> } @llvm.aarch64.sve.ldnt1.pn.x2.nxv8bf16(target("aarch64.svcount"), ptr)
+
+declare { <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64> } @llvm.aarch64.sve.ldnt1.pn.x4.nxv2i64(target("aarch64.svcount"), ptr)
+declare { <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32> } @llvm.aarch64.sve.ldnt1.pn.x4.nxv4i32(target("aarch64.svcount"), ptr)
+declare { <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16> } @llvm.aarch64.sve.ldnt1.pn.x4.nxv8i16(target("aarch64.svcount"), ptr)
+declare { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } @llvm.aarch64.sve.ldnt1.pn.x4.nxv16i8(target("aarch64.svcount"), ptr)
+declare { <vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x double> } @llvm.aarch64.sve.ldnt1.pn.x4.nxv2f64(target("aarch64.svcount"), ptr)
+declare { <vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float> } @llvm.aarch64.sve.ldnt1.pn.x4.nxv4f32(target("aarch64.svcount"), ptr)
+declare { <vscale x 8 x half>, <vscale x 8 x half>, <vscale x 8 x half>, <vscale x 8 x half> } @llvm.aarch64.sve.ldnt1.pn.x4.nxv8f16(target("aarch64.svcount"), ptr)
+declare { <vscale x 8 x bfloat>, <vscale x 8 x bfloat>, <vscale x 8 x bfloat>, <vscale x 8 x bfloat> } @llvm.aarch64.sve.ldnt1.pn.x4.nxv8bf16(target("aarch64.svcount"), ptr)
diff --git a/llvm/test/CodeGen/AArch64/sve2p1-intrinsics-stores.ll b/llvm/test/CodeGen/AArch64/sve2p1-intrinsics-stores.ll
new file mode 100644
index 000000000000..eb3d199a31e2
--- /dev/null
+++ b/llvm/test/CodeGen/AArch64/sve2p1-intrinsics-stores.ll
@@ -0,0 +1,650 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc -mtriple=aarch64-linux-gnu -mattr=+sve2p1,+bf16 < %s | FileCheck %s
+; RUN: llc -mtriple=aarch64-linux-gnu -mattr=+sme2,+bf16 < %s | FileCheck %s
+
+; == Normal Multi-Vector Consecutive Stores ==
+
+define void @st1_x2_i8(<vscale x 16 x i8> %unused, <vscale x 16 x i8> %zn0, <vscale x 16 x i8> %zn1, target("aarch64.svcount") %pn, ptr %ptr) nounwind {
+; CHECK-LABEL: st1_x2_i8:
+; CHECK: // %bb.0:
+; CHECK-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill
+; CHECK-NEXT: addvl sp, sp, #-1
+; CHECK-NEXT: mov z3.d, z2.d
+; CHECK-NEXT: str p8, [sp, #7, mul vl] // 2-byte Folded Spill
+; CHECK-NEXT: mov p8.b, p0.b
+; CHECK-NEXT: mov z2.d, z1.d
+; CHECK-NEXT: st1b { z2.b, z3.b }, pn8, [x0]
+; CHECK-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload
+; CHECK-NEXT: addvl sp, sp, #1
+; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload
+; CHECK-NEXT: ret
+ call void @llvm.aarch64.sve.st1.pn.x2.nxv16i8(<vscale x 16 x i8> %zn0, <vscale x 16 x i8> %zn1, target("aarch64.svcount") %pn, ptr %ptr);
+ ret void
+}
+
+define void @st1_x2_i16(<vscale x 16 x i8> %unused, <vscale x 8 x i16> %zn0, <vscale x 8 x i16> %zn1, target("aarch64.svcount") %pn, ptr %ptr) nounwind {
+; CHECK-LABEL: st1_x2_i16:
+; CHECK: // %bb.0:
+; CHECK-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill
+; CHECK-NEXT: addvl sp, sp, #-1
+; CHECK-NEXT: mov z3.d, z2.d
+; CHECK-NEXT: str p8, [sp, #7, mul vl] // 2-byte Folded Spill
+; CHECK-NEXT: mov p8.b, p0.b
+; CHECK-NEXT: mov z2.d, z1.d
+; CHECK-NEXT: st1h { z2.h, z3.h }, pn8, [x0]
+; CHECK-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload
+; CHECK-NEXT: addvl sp, sp, #1
+; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload
+; CHECK-NEXT: ret
+ call void @llvm.aarch64.sve.st1.pn.x2.nxv8i16(<vscale x 8 x i16> %zn0, <vscale x 8 x i16> %zn1, target("aarch64.svcount") %pn, ptr %ptr);
+ ret void
+}
+
+define void @st1_x2_i32(<vscale x 16 x i8> %unused, <vscale x 4 x i32> %zn0, <vscale x 4 x i32> %zn1, target("aarch64.svcount") %pn, ptr %ptr) nounwind {
+; CHECK-LABEL: st1_x2_i32:
+; CHECK: // %bb.0:
+; CHECK-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill
+; CHECK-NEXT: addvl sp, sp, #-1
+; CHECK-NEXT: mov z3.d, z2.d
+; CHECK-NEXT: str p8, [sp, #7, mul vl] // 2-byte Folded Spill
+; CHECK-NEXT: mov p8.b, p0.b
+; CHECK-NEXT: mov z2.d, z1.d
+; CHECK-NEXT: st1w { z2.s, z3.s }, pn8, [x0]
+; CHECK-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload
+; CHECK-NEXT: addvl sp, sp, #1
+; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload
+; CHECK-NEXT: ret
+ call void @llvm.aarch64.sve.st1.pn.x2.nxv4i32(<vscale x 4 x i32> %zn0, <vscale x 4 x i32> %zn1, target("aarch64.svcount") %pn, ptr %ptr);
+ ret void
+}
+
+define void @st1_x2_i64(<vscale x 16 x i8> %unused, <vscale x 2 x i64> %zn0, <vscale x 2 x i64> %zn1, target("aarch64.svcount") %pn, ptr %ptr) nounwind {
+; CHECK-LABEL: st1_x2_i64:
+; CHECK: // %bb.0:
+; CHECK-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill
+; CHECK-NEXT: addvl sp, sp, #-1
+; CHECK-NEXT: mov z3.d, z2.d
+; CHECK-NEXT: str p8, [sp, #7, mul vl] // 2-byte Folded Spill
+; CHECK-NEXT: mov p8.b, p0.b
+; CHECK-NEXT: mov z2.d, z1.d
+; CHECK-NEXT: st1d { z2.d, z3.d }, pn8, [x0]
+; CHECK-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload
+; CHECK-NEXT: addvl sp, sp, #1
+; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload
+; CHECK-NEXT: ret
+ call void @llvm.aarch64.sve.st1.pn.x2.nxv2i64(<vscale x 2 x i64> %zn0, <vscale x 2 x i64> %zn1, target("aarch64.svcount") %pn, ptr %ptr);
+ ret void
+}
+
+define void @st1_x2_f16(<vscale x 16 x i8> %unused, <vscale x 8 x half> %zn0, <vscale x 8 x half> %zn1, target("aarch64.svcount") %pn, ptr %ptr) nounwind {
+; CHECK-LABEL: st1_x2_f16:
+; CHECK: // %bb.0:
+; CHECK-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill
+; CHECK-NEXT: addvl sp, sp, #-1
+; CHECK-NEXT: mov z3.d, z2.d
+; CHECK-NEXT: str p8, [sp, #7, mul vl] // 2-byte Folded Spill
+; CHECK-NEXT: mov p8.b, p0.b
+; CHECK-NEXT: mov z2.d, z1.d
+; CHECK-NEXT: st1h { z2.h, z3.h }, pn8, [x0]
+; CHECK-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload
+; CHECK-NEXT: addvl sp, sp, #1
+; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload
+; CHECK-NEXT: ret
+ call void @llvm.aarch64.sve.st1.pn.x2.nxv8f16(<vscale x 8 x half> %zn0, <vscale x 8 x half> %zn1, target("aarch64.svcount") %pn, ptr %ptr);
+ ret void
+}
+
+define void @st1_x2_bf16(<vscale x 16 x i8> %unused, <vscale x 8 x bfloat> %zn0, <vscale x 8 x bfloat> %zn1, target("aarch64.svcount") %pn, ptr %ptr) nounwind {
+; CHECK-LABEL: st1_x2_bf16:
+; CHECK: // %bb.0:
+; CHECK-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill
+; CHECK-NEXT: addvl sp, sp, #-1
+; CHECK-NEXT: mov z3.d, z2.d
+; CHECK-NEXT: str p8, [sp, #7, mul vl] // 2-byte Folded Spill
+; CHECK-NEXT: mov p8.b, p0.b
+; CHECK-NEXT: mov z2.d, z1.d
+; CHECK-NEXT: st1h { z2.h, z3.h }, pn8, [x0]
+; CHECK-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload
+; CHECK-NEXT: addvl sp, sp, #1
+; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload
+; CHECK-NEXT: ret
+ call void @llvm.aarch64.sve.st1.pn.x2.nxv8bf16(<vscale x 8 x bfloat> %zn0, <vscale x 8 x bfloat> %zn1, target("aarch64.svcount") %pn, ptr %ptr);
+ ret void
+}
+
+define void @st1_x2_f32(<vscale x 16 x i8> %unused, <vscale x 4 x float> %zn0, <vscale x 4 x float> %zn1, target("aarch64.svcount") %pn, ptr %ptr) nounwind {
+; CHECK-LABEL: st1_x2_f32:
+; CHECK: // %bb.0:
+; CHECK-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill
+; CHECK-NEXT: addvl sp, sp, #-1
+; CHECK-NEXT: mov z3.d, z2.d
+; CHECK-NEXT: str p8, [sp, #7, mul vl] // 2-byte Folded Spill
+; CHECK-NEXT: mov p8.b, p0.b
+; CHECK-NEXT: mov z2.d, z1.d
+; CHECK-NEXT: st1w { z2.s, z3.s }, pn8, [x0]
+; CHECK-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload
+; CHECK-NEXT: addvl sp, sp, #1
+; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload
+; CHECK-NEXT: ret
+ call void @llvm.aarch64.sve.st1.pn.x2.nxv4f32(<vscale x 4 x float> %zn0, <vscale x 4 x float> %zn1, target("aarch64.svcount") %pn, ptr %ptr);
+ ret void
+}
+
+define void @st1_x2_f64(<vscale x 16 x i8> %unused, <vscale x 2 x double> %zn0, <vscale x 2 x double> %zn1, target("aarch64.svcount") %pn, ptr %ptr) nounwind {
+; CHECK-LABEL: st1_x2_f64:
+; CHECK: // %bb.0:
+; CHECK-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill
+; CHECK-NEXT: addvl sp, sp, #-1
+; CHECK-NEXT: mov z3.d, z2.d
+; CHECK-NEXT: str p8, [sp, #7, mul vl] // 2-byte Folded Spill
+; CHECK-NEXT: mov p8.b, p0.b
+; CHECK-NEXT: mov z2.d, z1.d
+; CHECK-NEXT: st1d { z2.d, z3.d }, pn8, [x0]
+; CHECK-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload
+; CHECK-NEXT: addvl sp, sp, #1
+; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload
+; CHECK-NEXT: ret
+ call void @llvm.aarch64.sve.st1.pn.x2.nxv2f64(<vscale x 2 x double> %zn0, <vscale x 2 x double> %zn1, target("aarch64.svcount") %pn, ptr %ptr);
+ ret void
+}
+
+define void @st1_x4_i8(<vscale x 16 x i8> %unused, <vscale x 16 x i8> %zn0, <vscale x 16 x i8> %zn1, <vscale x 16 x i8> %zn2, <vscale x 16 x i8> %zn3, target("aarch64.svcount") %pn, ptr %ptr) nounwind {
+; CHECK-LABEL: st1_x4_i8:
+; CHECK: // %bb.0:
+; CHECK-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill
+; CHECK-NEXT: addvl sp, sp, #-1
+; CHECK-NEXT: mov z7.d, z4.d
+; CHECK-NEXT: str p8, [sp, #7, mul vl] // 2-byte Folded Spill
+; CHECK-NEXT: mov z6.d, z3.d
+; CHECK-NEXT: mov p8.b, p0.b
+; CHECK-NEXT: mov z5.d, z2.d
+; CHECK-NEXT: mov z4.d, z1.d
+; CHECK-NEXT: st1b { z4.b - z7.b }, pn8, [x0]
+; CHECK-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload
+; CHECK-NEXT: addvl sp, sp, #1
+; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload
+; CHECK-NEXT: ret
+ call void @llvm.aarch64.sve.st1.pn.x4.nxv16i8(<vscale x 16 x i8> %zn0, <vscale x 16 x i8> %zn1, <vscale x 16 x i8> %zn2, <vscale x 16 x i8> %zn3, target("aarch64.svcount") %pn, ptr %ptr);
+ ret void
+}
+
+define void @st1_x4_i16(<vscale x 16 x i8> %unused, <vscale x 8 x i16> %zn0, <vscale x 8 x i16> %zn1, <vscale x 8 x i16> %zn2, <vscale x 8 x i16> %zn3, target("aarch64.svcount") %pn, ptr %ptr) nounwind {
+; CHECK-LABEL: st1_x4_i16:
+; CHECK: // %bb.0:
+; CHECK-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill
+; CHECK-NEXT: addvl sp, sp, #-1
+; CHECK-NEXT: mov z7.d, z4.d
+; CHECK-NEXT: str p8, [sp, #7, mul vl] // 2-byte Folded Spill
+; CHECK-NEXT: mov z6.d, z3.d
+; CHECK-NEXT: mov p8.b, p0.b
+; CHECK-NEXT: mov z5.d, z2.d
+; CHECK-NEXT: mov z4.d, z1.d
+; CHECK-NEXT: st1h { z4.h - z7.h }, pn8, [x0]
+; CHECK-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload
+; CHECK-NEXT: addvl sp, sp, #1
+; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload
+; CHECK-NEXT: ret
+ call void @llvm.aarch64.sve.st1.pn.x4.nxv8i16(<vscale x 8 x i16> %zn0, <vscale x 8 x i16> %zn1, <vscale x 8 x i16> %zn2, <vscale x 8 x i16> %zn3, target("aarch64.svcount") %pn, ptr %ptr);
+ ret void
+}
+
+define void @st1_x4_i32(<vscale x 16 x i8> %unused, <vscale x 4 x i32> %zn0, <vscale x 4 x i32> %zn1, <vscale x 4 x i32> %zn2, <vscale x 4 x i32> %zn3, target("aarch64.svcount") %pn, ptr %ptr) nounwind {
+; CHECK-LABEL: st1_x4_i32:
+; CHECK: // %bb.0:
+; CHECK-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill
+; CHECK-NEXT: addvl sp, sp, #-1
+; CHECK-NEXT: mov z7.d, z4.d
+; CHECK-NEXT: str p8, [sp, #7, mul vl] // 2-byte Folded Spill
+; CHECK-NEXT: mov z6.d, z3.d
+; CHECK-NEXT: mov p8.b, p0.b
+; CHECK-NEXT: mov z5.d, z2.d
+; CHECK-NEXT: mov z4.d, z1.d
+; CHECK-NEXT: st1w { z4.s - z7.s }, pn8, [x0]
+; CHECK-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload
+; CHECK-NEXT: addvl sp, sp, #1
+; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload
+; CHECK-NEXT: ret
+ call void @llvm.aarch64.sve.st1.pn.x4.nxv4i32(<vscale x 4 x i32> %zn0, <vscale x 4 x i32> %zn1, <vscale x 4 x i32> %zn2, <vscale x 4 x i32> %zn3, target("aarch64.svcount") %pn, ptr %ptr);
+ ret void
+}
+
+define void @st1_x4_i64(<vscale x 16 x i8> %unused, <vscale x 2 x i64> %zn0, <vscale x 2 x i64> %zn1, <vscale x 2 x i64> %zn2, <vscale x 2 x i64> %zn3, target("aarch64.svcount") %pn, ptr %ptr) nounwind {
+; CHECK-LABEL: st1_x4_i64:
+; CHECK: // %bb.0:
+; CHECK-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill
+; CHECK-NEXT: addvl sp, sp, #-1
+; CHECK-NEXT: mov z7.d, z4.d
+; CHECK-NEXT: str p8, [sp, #7, mul vl] // 2-byte Folded Spill
+; CHECK-NEXT: mov z6.d, z3.d
+; CHECK-NEXT: mov p8.b, p0.b
+; CHECK-NEXT: mov z5.d, z2.d
+; CHECK-NEXT: mov z4.d, z1.d
+; CHECK-NEXT: st1d { z4.d - z7.d }, pn8, [x0]
+; CHECK-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload
+; CHECK-NEXT: addvl sp, sp, #1
+; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload
+; CHECK-NEXT: ret
+ call void @llvm.aarch64.sve.st1.pn.x4.nxv2i64(<vscale x 2 x i64> %zn0, <vscale x 2 x i64> %zn1, <vscale x 2 x i64> %zn2, <vscale x 2 x i64> %zn3, target("aarch64.svcount") %pn, ptr %ptr);
+ ret void
+}
+
+define void @st1_x4_f16(<vscale x 16 x i8> %unused, <vscale x 8 x half> %zn0, <vscale x 8 x half> %zn1, <vscale x 8 x half> %zn2, <vscale x 8 x half> %zn3, target("aarch64.svcount") %pn, ptr %ptr) nounwind {
+; CHECK-LABEL: st1_x4_f16:
+; CHECK: // %bb.0:
+; CHECK-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill
+; CHECK-NEXT: addvl sp, sp, #-1
+; CHECK-NEXT: mov z7.d, z4.d
+; CHECK-NEXT: str p8, [sp, #7, mul vl] // 2-byte Folded Spill
+; CHECK-NEXT: mov z6.d, z3.d
+; CHECK-NEXT: mov p8.b, p0.b
+; CHECK-NEXT: mov z5.d, z2.d
+; CHECK-NEXT: mov z4.d, z1.d
+; CHECK-NEXT: st1h { z4.h - z7.h }, pn8, [x0]
+; CHECK-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload
+; CHECK-NEXT: addvl sp, sp, #1
+; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload
+; CHECK-NEXT: ret
+ call void @llvm.aarch64.sve.st1.pn.x4.nxv8f16(<vscale x 8 x half> %zn0, <vscale x 8 x half> %zn1, <vscale x 8 x half> %zn2, <vscale x 8 x half> %zn3, target("aarch64.svcount") %pn, ptr %ptr);
+ ret void
+}
+
+define void @st1_x4_bf16(<vscale x 16 x i8> %unused, <vscale x 8 x bfloat> %zn0, <vscale x 8 x bfloat> %zn1, <vscale x 8 x bfloat> %zn2, <vscale x 8 x bfloat> %zn3, target("aarch64.svcount") %pn, ptr %ptr) nounwind {
+; CHECK-LABEL: st1_x4_bf16:
+; CHECK: // %bb.0:
+; CHECK-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill
+; CHECK-NEXT: addvl sp, sp, #-1
+; CHECK-NEXT: mov z7.d, z4.d
+; CHECK-NEXT: str p8, [sp, #7, mul vl] // 2-byte Folded Spill
+; CHECK-NEXT: mov z6.d, z3.d
+; CHECK-NEXT: mov p8.b, p0.b
+; CHECK-NEXT: mov z5.d, z2.d
+; CHECK-NEXT: mov z4.d, z1.d
+; CHECK-NEXT: st1h { z4.h - z7.h }, pn8, [x0]
+; CHECK-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload
+; CHECK-NEXT: addvl sp, sp, #1
+; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload
+; CHECK-NEXT: ret
+ call void @llvm.aarch64.sve.st1.pn.x4.nxv8bf16(<vscale x 8 x bfloat> %zn0, <vscale x 8 x bfloat> %zn1, <vscale x 8 x bfloat> %zn2, <vscale x 8 x bfloat> %zn3, target("aarch64.svcount") %pn, ptr %ptr);
+ ret void
+}
+
+define void @st1_x4_f32(<vscale x 16 x i8> %unused, <vscale x 4 x float> %zn0, <vscale x 4 x float> %zn1, <vscale x 4 x float> %zn2, <vscale x 4 x float> %zn3, target("aarch64.svcount") %pn, ptr %ptr) nounwind {
+; CHECK-LABEL: st1_x4_f32:
+; CHECK: // %bb.0:
+; CHECK-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill
+; CHECK-NEXT: addvl sp, sp, #-1
+; CHECK-NEXT: mov z7.d, z4.d
+; CHECK-NEXT: str p8, [sp, #7, mul vl] // 2-byte Folded Spill
+; CHECK-NEXT: mov z6.d, z3.d
+; CHECK-NEXT: mov p8.b, p0.b
+; CHECK-NEXT: mov z5.d, z2.d
+; CHECK-NEXT: mov z4.d, z1.d
+; CHECK-NEXT: st1w { z4.s - z7.s }, pn8, [x0]
+; CHECK-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload
+; CHECK-NEXT: addvl sp, sp, #1
+; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload
+; CHECK-NEXT: ret
+ call void @llvm.aarch64.sve.st1.pn.x4.nxv4f32(<vscale x 4 x float> %zn0, <vscale x 4 x float> %zn1, <vscale x 4 x float> %zn2, <vscale x 4 x float> %zn3, target("aarch64.svcount") %pn, ptr %ptr);
+ ret void
+}
+
+define void @st1_x4_f64(<vscale x 16 x i8> %unused, <vscale x 2 x double> %zn0, <vscale x 2 x double> %zn1, <vscale x 2 x double> %zn2, <vscale x 2 x double> %zn3, target("aarch64.svcount") %pn, ptr %ptr) nounwind {
+; CHECK-LABEL: st1_x4_f64:
+; CHECK: // %bb.0:
+; CHECK-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill
+; CHECK-NEXT: addvl sp, sp, #-1
+; CHECK-NEXT: mov z7.d, z4.d
+; CHECK-NEXT: str p8, [sp, #7, mul vl] // 2-byte Folded Spill
+; CHECK-NEXT: mov z6.d, z3.d
+; CHECK-NEXT: mov p8.b, p0.b
+; CHECK-NEXT: mov z5.d, z2.d
+; CHECK-NEXT: mov z4.d, z1.d
+; CHECK-NEXT: st1d { z4.d - z7.d }, pn8, [x0]
+; CHECK-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload
+; CHECK-NEXT: addvl sp, sp, #1
+; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload
+; CHECK-NEXT: ret
+ call void @llvm.aarch64.sve.st1.pn.x4.nxv2f64(<vscale x 2 x double> %zn0, <vscale x 2 x double> %zn1, <vscale x 2 x double> %zn2, <vscale x 2 x double> %zn3, target("aarch64.svcount") %pn, ptr %ptr);
+ ret void
+}
+
+; == Non-temporal Multi-Vector Consecutive Stores ==
+
+define void @stnt1_x2_i8(<vscale x 16 x i8> %unused, <vscale x 16 x i8> %zn0, <vscale x 16 x i8> %zn1, target("aarch64.svcount") %pn, ptr %ptr) nounwind {
+; CHECK-LABEL: stnt1_x2_i8:
+; CHECK: // %bb.0:
+; CHECK-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill
+; CHECK-NEXT: addvl sp, sp, #-1
+; CHECK-NEXT: mov z3.d, z2.d
+; CHECK-NEXT: str p8, [sp, #7, mul vl] // 2-byte Folded Spill
+; CHECK-NEXT: mov p8.b, p0.b
+; CHECK-NEXT: mov z2.d, z1.d
+; CHECK-NEXT: stnt1b { z2.b, z3.b }, pn8, [x0]
+; CHECK-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload
+; CHECK-NEXT: addvl sp, sp, #1
+; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload
+; CHECK-NEXT: ret
+ call void @llvm.aarch64.sve.stnt1.pn.x2.nxv16i8(<vscale x 16 x i8> %zn0, <vscale x 16 x i8> %zn1, target("aarch64.svcount") %pn, ptr %ptr);
+ ret void
+}
+
+define void @stnt1_x2_i16(<vscale x 16 x i8> %unused, <vscale x 8 x i16> %zn0, <vscale x 8 x i16> %zn1, target("aarch64.svcount") %pn, ptr %ptr) nounwind {
+; CHECK-LABEL: stnt1_x2_i16:
+; CHECK: // %bb.0:
+; CHECK-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill
+; CHECK-NEXT: addvl sp, sp, #-1
+; CHECK-NEXT: mov z3.d, z2.d
+; CHECK-NEXT: str p8, [sp, #7, mul vl] // 2-byte Folded Spill
+; CHECK-NEXT: mov p8.b, p0.b
+; CHECK-NEXT: mov z2.d, z1.d
+; CHECK-NEXT: stnt1h { z2.h, z3.h }, pn8, [x0]
+; CHECK-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload
+; CHECK-NEXT: addvl sp, sp, #1
+; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload
+; CHECK-NEXT: ret
+ call void @llvm.aarch64.sve.stnt1.pn.x2.nxv8i16(<vscale x 8 x i16> %zn0, <vscale x 8 x i16> %zn1, target("aarch64.svcount") %pn, ptr %ptr);
+ ret void
+}
+
+define void @stnt1_x2_i32(<vscale x 16 x i8> %unused, <vscale x 4 x i32> %zn0, <vscale x 4 x i32> %zn1, target("aarch64.svcount") %pn, ptr %ptr) nounwind {
+; CHECK-LABEL: stnt1_x2_i32:
+; CHECK: // %bb.0:
+; CHECK-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill
+; CHECK-NEXT: addvl sp, sp, #-1
+; CHECK-NEXT: mov z3.d, z2.d
+; CHECK-NEXT: str p8, [sp, #7, mul vl] // 2-byte Folded Spill
+; CHECK-NEXT: mov p8.b, p0.b
+; CHECK-NEXT: mov z2.d, z1.d
+; CHECK-NEXT: stnt1w { z2.s, z3.s }, pn8, [x0]
+; CHECK-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload
+; CHECK-NEXT: addvl sp, sp, #1
+; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload
+; CHECK-NEXT: ret
+ call void @llvm.aarch64.sve.stnt1.pn.x2.nxv4i32(<vscale x 4 x i32> %zn0, <vscale x 4 x i32> %zn1, target("aarch64.svcount") %pn, ptr %ptr);
+ ret void
+}
+
+define void @stnt1_x2_i64(<vscale x 16 x i8> %unused, <vscale x 2 x i64> %zn0, <vscale x 2 x i64> %zn1, target("aarch64.svcount") %pn, ptr %ptr) nounwind {
+; CHECK-LABEL: stnt1_x2_i64:
+; CHECK: // %bb.0:
+; CHECK-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill
+; CHECK-NEXT: addvl sp, sp, #-1
+; CHECK-NEXT: mov z3.d, z2.d
+; CHECK-NEXT: str p8, [sp, #7, mul vl] // 2-byte Folded Spill
+; CHECK-NEXT: mov p8.b, p0.b
+; CHECK-NEXT: mov z2.d, z1.d
+; CHECK-NEXT: stnt1d { z2.d, z3.d }, pn8, [x0]
+; CHECK-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload
+; CHECK-NEXT: addvl sp, sp, #1
+; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload
+; CHECK-NEXT: ret
+ call void @llvm.aarch64.sve.stnt1.pn.x2.nxv2i64(<vscale x 2 x i64> %zn0, <vscale x 2 x i64> %zn1, target("aarch64.svcount") %pn, ptr %ptr);
+ ret void
+}
+
+define void @stnt1_x2_f16(<vscale x 16 x i8> %unused, <vscale x 8 x half> %zn0, <vscale x 8 x half> %zn1, target("aarch64.svcount") %pn, ptr %ptr) nounwind {
+; CHECK-LABEL: stnt1_x2_f16:
+; CHECK: // %bb.0:
+; CHECK-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill
+; CHECK-NEXT: addvl sp, sp, #-1
+; CHECK-NEXT: mov z3.d, z2.d
+; CHECK-NEXT: str p8, [sp, #7, mul vl] // 2-byte Folded Spill
+; CHECK-NEXT: mov p8.b, p0.b
+; CHECK-NEXT: mov z2.d, z1.d
+; CHECK-NEXT: stnt1h { z2.h, z3.h }, pn8, [x0]
+; CHECK-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload
+; CHECK-NEXT: addvl sp, sp, #1
+; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload
+; CHECK-NEXT: ret
+ call void @llvm.aarch64.sve.stnt1.pn.x2.nxv8f16(<vscale x 8 x half> %zn0, <vscale x 8 x half> %zn1, target("aarch64.svcount") %pn, ptr %ptr);
+ ret void
+}
+
+define void @stnt1_x2_bf16(<vscale x 16 x i8> %unused, <vscale x 8 x bfloat> %zn0, <vscale x 8 x bfloat> %zn1, target("aarch64.svcount") %pn, ptr %ptr) nounwind {
+; CHECK-LABEL: stnt1_x2_bf16:
+; CHECK: // %bb.0:
+; CHECK-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill
+; CHECK-NEXT: addvl sp, sp, #-1
+; CHECK-NEXT: mov z3.d, z2.d
+; CHECK-NEXT: str p8, [sp, #7, mul vl] // 2-byte Folded Spill
+; CHECK-NEXT: mov p8.b, p0.b
+; CHECK-NEXT: mov z2.d, z1.d
+; CHECK-NEXT: stnt1h { z2.h, z3.h }, pn8, [x0]
+; CHECK-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload
+; CHECK-NEXT: addvl sp, sp, #1
+; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload
+; CHECK-NEXT: ret
+ call void @llvm.aarch64.sve.stnt1.pn.x2.nxv8bf16(<vscale x 8 x bfloat> %zn0, <vscale x 8 x bfloat> %zn1, target("aarch64.svcount") %pn, ptr %ptr);
+ ret void
+}
+
+define void @stnt1_x2_f32(<vscale x 16 x i8> %unused, <vscale x 4 x float> %zn0, <vscale x 4 x float> %zn1, target("aarch64.svcount") %pn, ptr %ptr) nounwind {
+; CHECK-LABEL: stnt1_x2_f32:
+; CHECK: // %bb.0:
+; CHECK-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill
+; CHECK-NEXT: addvl sp, sp, #-1
+; CHECK-NEXT: mov z3.d, z2.d
+; CHECK-NEXT: str p8, [sp, #7, mul vl] // 2-byte Folded Spill
+; CHECK-NEXT: mov p8.b, p0.b
+; CHECK-NEXT: mov z2.d, z1.d
+; CHECK-NEXT: stnt1w { z2.s, z3.s }, pn8, [x0]
+; CHECK-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload
+; CHECK-NEXT: addvl sp, sp, #1
+; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload
+; CHECK-NEXT: ret
+ call void @llvm.aarch64.sve.stnt1.pn.x2.nxv4f32(<vscale x 4 x float> %zn0, <vscale x 4 x float> %zn1, target("aarch64.svcount") %pn, ptr %ptr);
+ ret void
+}
+
+define void @stnt1_x2_f64(<vscale x 16 x i8> %unused, <vscale x 2 x double> %zn0, <vscale x 2 x double> %zn1, target("aarch64.svcount") %pn, ptr %ptr) nounwind {
+; CHECK-LABEL: stnt1_x2_f64:
+; CHECK: // %bb.0:
+; CHECK-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill
+; CHECK-NEXT: addvl sp, sp, #-1
+; CHECK-NEXT: mov z3.d, z2.d
+; CHECK-NEXT: str p8, [sp, #7, mul vl] // 2-byte Folded Spill
+; CHECK-NEXT: mov p8.b, p0.b
+; CHECK-NEXT: mov z2.d, z1.d
+; CHECK-NEXT: stnt1d { z2.d, z3.d }, pn8, [x0]
+; CHECK-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload
+; CHECK-NEXT: addvl sp, sp, #1
+; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload
+; CHECK-NEXT: ret
+ call void @llvm.aarch64.sve.stnt1.pn.x2.nxv2f64(<vscale x 2 x double> %zn0, <vscale x 2 x double> %zn1, target("aarch64.svcount") %pn, ptr %ptr);
+ ret void
+}
+
+define void @stnt1_x4_i8(<vscale x 16 x i8> %unused, <vscale x 16 x i8> %zn0, <vscale x 16 x i8> %zn1, <vscale x 16 x i8> %zn2, <vscale x 16 x i8> %zn3, target("aarch64.svcount") %pn, ptr %ptr) nounwind {
+; CHECK-LABEL: stnt1_x4_i8:
+; CHECK: // %bb.0:
+; CHECK-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill
+; CHECK-NEXT: addvl sp, sp, #-1
+; CHECK-NEXT: mov z7.d, z4.d
+; CHECK-NEXT: str p8, [sp, #7, mul vl] // 2-byte Folded Spill
+; CHECK-NEXT: mov z6.d, z3.d
+; CHECK-NEXT: mov p8.b, p0.b
+; CHECK-NEXT: mov z5.d, z2.d
+; CHECK-NEXT: mov z4.d, z1.d
+; CHECK-NEXT: stnt1b { z4.b - z7.b }, pn8, [x0]
+; CHECK-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload
+; CHECK-NEXT: addvl sp, sp, #1
+; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload
+; CHECK-NEXT: ret
+ call void @llvm.aarch64.sve.stnt1.pn.x4.nxv16i8(<vscale x 16 x i8> %zn0, <vscale x 16 x i8> %zn1, <vscale x 16 x i8> %zn2, <vscale x 16 x i8> %zn3, target("aarch64.svcount") %pn, ptr %ptr);
+ ret void
+}
+
+define void @stnt1_x4_i16(<vscale x 16 x i8> %unused, <vscale x 8 x i16> %zn0, <vscale x 8 x i16> %zn1, <vscale x 8 x i16> %zn2, <vscale x 8 x i16> %zn3, target("aarch64.svcount") %pn, ptr %ptr) nounwind {
+; CHECK-LABEL: stnt1_x4_i16:
+; CHECK: // %bb.0:
+; CHECK-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill
+; CHECK-NEXT: addvl sp, sp, #-1
+; CHECK-NEXT: mov z7.d, z4.d
+; CHECK-NEXT: str p8, [sp, #7, mul vl] // 2-byte Folded Spill
+; CHECK-NEXT: mov z6.d, z3.d
+; CHECK-NEXT: mov p8.b, p0.b
+; CHECK-NEXT: mov z5.d, z2.d
+; CHECK-NEXT: mov z4.d, z1.d
+; CHECK-NEXT: stnt1h { z4.h - z7.h }, pn8, [x0]
+; CHECK-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload
+; CHECK-NEXT: addvl sp, sp, #1
+; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload
+; CHECK-NEXT: ret
+ call void @llvm.aarch64.sve.stnt1.pn.x4.nxv8i16(<vscale x 8 x i16> %zn0, <vscale x 8 x i16> %zn1, <vscale x 8 x i16> %zn2, <vscale x 8 x i16> %zn3, target("aarch64.svcount") %pn, ptr %ptr);
+ ret void
+}
+
+define void @stnt1_x4_i32(<vscale x 16 x i8> %unused, <vscale x 4 x i32> %zn0, <vscale x 4 x i32> %zn1, <vscale x 4 x i32> %zn2, <vscale x 4 x i32> %zn3, target("aarch64.svcount") %pn, ptr %ptr) nounwind {
+; CHECK-LABEL: stnt1_x4_i32:
+; CHECK: // %bb.0:
+; CHECK-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill
+; CHECK-NEXT: addvl sp, sp, #-1
+; CHECK-NEXT: mov z7.d, z4.d
+; CHECK-NEXT: str p8, [sp, #7, mul vl] // 2-byte Folded Spill
+; CHECK-NEXT: mov z6.d, z3.d
+; CHECK-NEXT: mov p8.b, p0.b
+; CHECK-NEXT: mov z5.d, z2.d
+; CHECK-NEXT: mov z4.d, z1.d
+; CHECK-NEXT: stnt1w { z4.s - z7.s }, pn8, [x0]
+; CHECK-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload
+; CHECK-NEXT: addvl sp, sp, #1
+; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload
+; CHECK-NEXT: ret
+ call void @llvm.aarch64.sve.stnt1.pn.x4.nxv4i32(<vscale x 4 x i32> %zn0, <vscale x 4 x i32> %zn1, <vscale x 4 x i32> %zn2, <vscale x 4 x i32> %zn3, target("aarch64.svcount") %pn, ptr %ptr);
+ ret void
+}
+
+define void @stnt1_x4_i64(<vscale x 16 x i8> %unused, <vscale x 2 x i64> %zn0, <vscale x 2 x i64> %zn1, <vscale x 2 x i64> %zn2, <vscale x 2 x i64> %zn3, target("aarch64.svcount") %pn, ptr %ptr) nounwind {
+; CHECK-LABEL: stnt1_x4_i64:
+; CHECK: // %bb.0:
+; CHECK-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill
+; CHECK-NEXT: addvl sp, sp, #-1
+; CHECK-NEXT: mov z7.d, z4.d
+; CHECK-NEXT: str p8, [sp, #7, mul vl] // 2-byte Folded Spill
+; CHECK-NEXT: mov z6.d, z3.d
+; CHECK-NEXT: mov p8.b, p0.b
+; CHECK-NEXT: mov z5.d, z2.d
+; CHECK-NEXT: mov z4.d, z1.d
+; CHECK-NEXT: stnt1d { z4.d - z7.d }, pn8, [x0]
+; CHECK-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload
+; CHECK-NEXT: addvl sp, sp, #1
+; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload
+; CHECK-NEXT: ret
+ call void @llvm.aarch64.sve.stnt1.pn.x4.nxv2i64(<vscale x 2 x i64> %zn0, <vscale x 2 x i64> %zn1, <vscale x 2 x i64> %zn2, <vscale x 2 x i64> %zn3, target("aarch64.svcount") %pn, ptr %ptr);
+ ret void
+}
+
+define void @stnt1_x4_f16(<vscale x 16 x i8> %unused, <vscale x 8 x half> %zn0, <vscale x 8 x half> %zn1, <vscale x 8 x half> %zn2, <vscale x 8 x half> %zn3, target("aarch64.svcount") %pn, ptr %ptr) nounwind {
+; CHECK-LABEL: stnt1_x4_f16:
+; CHECK: // %bb.0:
+; CHECK-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill
+; CHECK-NEXT: addvl sp, sp, #-1
+; CHECK-NEXT: mov z7.d, z4.d
+; CHECK-NEXT: str p8, [sp, #7, mul vl] // 2-byte Folded Spill
+; CHECK-NEXT: mov z6.d, z3.d
+; CHECK-NEXT: mov p8.b, p0.b
+; CHECK-NEXT: mov z5.d, z2.d
+; CHECK-NEXT: mov z4.d, z1.d
+; CHECK-NEXT: stnt1h { z4.h - z7.h }, pn8, [x0]
+; CHECK-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload
+; CHECK-NEXT: addvl sp, sp, #1
+; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload
+; CHECK-NEXT: ret
+ call void @llvm.aarch64.sve.stnt1.pn.x4.nxv8f16(<vscale x 8 x half> %zn0, <vscale x 8 x half> %zn1, <vscale x 8 x half> %zn2, <vscale x 8 x half> %zn3, target("aarch64.svcount") %pn, ptr %ptr);
+ ret void
+}
+
+define void @stnt1_x4_bf16(<vscale x 16 x i8> %unused, <vscale x 8 x bfloat> %zn0, <vscale x 8 x bfloat> %zn1, <vscale x 8 x bfloat> %zn2, <vscale x 8 x bfloat> %zn3, target("aarch64.svcount") %pn, ptr %ptr) nounwind {
+; CHECK-LABEL: stnt1_x4_bf16:
+; CHECK: // %bb.0:
+; CHECK-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill
+; CHECK-NEXT: addvl sp, sp, #-1
+; CHECK-NEXT: mov z7.d, z4.d
+; CHECK-NEXT: str p8, [sp, #7, mul vl] // 2-byte Folded Spill
+; CHECK-NEXT: mov z6.d, z3.d
+; CHECK-NEXT: mov p8.b, p0.b
+; CHECK-NEXT: mov z5.d, z2.d
+; CHECK-NEXT: mov z4.d, z1.d
+; CHECK-NEXT: stnt1h { z4.h - z7.h }, pn8, [x0]
+; CHECK-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload
+; CHECK-NEXT: addvl sp, sp, #1
+; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload
+; CHECK-NEXT: ret
+ call void @llvm.aarch64.sve.stnt1.pn.x4.nxv8bf16(<vscale x 8 x bfloat> %zn0, <vscale x 8 x bfloat> %zn1, <vscale x 8 x bfloat> %zn2, <vscale x 8 x bfloat> %zn3, target("aarch64.svcount") %pn, ptr %ptr);
+ ret void
+}
+
+define void @stnt1_x4_f32(<vscale x 16 x i8> %unused, <vscale x 4 x float> %zn0, <vscale x 4 x float> %zn1, <vscale x 4 x float> %zn2, <vscale x 4 x float> %zn3, target("aarch64.svcount") %pn, ptr %ptr) nounwind {
+; CHECK-LABEL: stnt1_x4_f32:
+; CHECK: // %bb.0:
+; CHECK-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill
+; CHECK-NEXT: addvl sp, sp, #-1
+; CHECK-NEXT: mov z7.d, z4.d
+; CHECK-NEXT: str p8, [sp, #7, mul vl] // 2-byte Folded Spill
+; CHECK-NEXT: mov z6.d, z3.d
+; CHECK-NEXT: mov p8.b, p0.b
+; CHECK-NEXT: mov z5.d, z2.d
+; CHECK-NEXT: mov z4.d, z1.d
+; CHECK-NEXT: stnt1w { z4.s - z7.s }, pn8, [x0]
+; CHECK-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload
+; CHECK-NEXT: addvl sp, sp, #1
+; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload
+; CHECK-NEXT: ret
+ call void @llvm.aarch64.sve.stnt1.pn.x4.nxv4f32(<vscale x 4 x float> %zn0, <vscale x 4 x float> %zn1, <vscale x 4 x float> %zn2, <vscale x 4 x float> %zn3, target("aarch64.svcount") %pn, ptr %ptr);
+ ret void
+}
+
+define void @stnt1_x4_f64(<vscale x 16 x i8> %unused, <vscale x 2 x double> %zn0, <vscale x 2 x double> %zn1, <vscale x 2 x double> %zn2, <vscale x 2 x double> %zn3, target("aarch64.svcount") %pn, ptr %ptr) nounwind {
+; CHECK-LABEL: stnt1_x4_f64:
+; CHECK: // %bb.0:
+; CHECK-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill
+; CHECK-NEXT: addvl sp, sp, #-1
+; CHECK-NEXT: mov z7.d, z4.d
+; CHECK-NEXT: str p8, [sp, #7, mul vl] // 2-byte Folded Spill
+; CHECK-NEXT: mov z6.d, z3.d
+; CHECK-NEXT: mov p8.b, p0.b
+; CHECK-NEXT: mov z5.d, z2.d
+; CHECK-NEXT: mov z4.d, z1.d
+; CHECK-NEXT: stnt1d { z4.d - z7.d }, pn8, [x0]
+; CHECK-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload
+; CHECK-NEXT: addvl sp, sp, #1
+; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload
+; CHECK-NEXT: ret
+ call void @llvm.aarch64.sve.stnt1.pn.x4.nxv2f64(<vscale x 2 x double> %zn0, <vscale x 2 x double> %zn1, <vscale x 2 x double> %zn2, <vscale x 2 x double> %zn3, target("aarch64.svcount") %pn, ptr %ptr);
+ ret void
+}
+
+declare void @llvm.aarch64.sve.st1.pn.x2.nxv16i8(<vscale x 16 x i8>, <vscale x 16 x i8>, target("aarch64.svcount"), ptr)
+declare void @llvm.aarch64.sve.st1.pn.x2.nxv8i16(<vscale x 8 x i16>, <vscale x 8 x i16>, target("aarch64.svcount"), ptr)
+declare void @llvm.aarch64.sve.st1.pn.x2.nxv4i32(<vscale x 4 x i32>, <vscale x 4 x i32>, target("aarch64.svcount"), ptr)
+declare void @llvm.aarch64.sve.st1.pn.x2.nxv2i64(<vscale x 2 x i64>, <vscale x 2 x i64>, target("aarch64.svcount"), ptr)
+declare void @llvm.aarch64.sve.st1.pn.x2.nxv8f16(<vscale x 8 x half>, <vscale x 8 x half>, target("aarch64.svcount"), ptr)
+declare void @llvm.aarch64.sve.st1.pn.x2.nxv8bf16(<vscale x 8 x bfloat>, <vscale x 8 x bfloat>, target("aarch64.svcount"), ptr)
+declare void @llvm.aarch64.sve.st1.pn.x2.nxv4f32(<vscale x 4 x float>, <vscale x 4 x float>, target("aarch64.svcount"), ptr)
+declare void @llvm.aarch64.sve.st1.pn.x2.nxv2f64(<vscale x 2 x double>, <vscale x 2 x double>, target("aarch64.svcount"), ptr)
+declare void @llvm.aarch64.sve.stnt1.pn.x2.nxv16i8(<vscale x 16 x i8>, <vscale x 16 x i8>, target("aarch64.svcount"), ptr)
+declare void @llvm.aarch64.sve.stnt1.pn.x2.nxv8i16(<vscale x 8 x i16>, <vscale x 8 x i16>, target("aarch64.svcount"), ptr)
+declare void @llvm.aarch64.sve.stnt1.pn.x2.nxv4i32(<vscale x 4 x i32>, <vscale x 4 x i32>, target("aarch64.svcount"), ptr)
+declare void @llvm.aarch64.sve.stnt1.pn.x2.nxv2i64(<vscale x 2 x i64>, <vscale x 2 x i64>, target("aarch64.svcount"), ptr)
+declare void @llvm.aarch64.sve.stnt1.pn.x2.nxv8f16(<vscale x 8 x half>, <vscale x 8 x half>, target("aarch64.svcount"), ptr)
+declare void @llvm.aarch64.sve.stnt1.pn.x2.nxv8bf16(<vscale x 8 x bfloat>, <vscale x 8 x bfloat>, target("aarch64.svcount"), ptr)
+declare void @llvm.aarch64.sve.stnt1.pn.x2.nxv4f32(<vscale x 4 x float>, <vscale x 4 x float>, target("aarch64.svcount"), ptr)
+declare void @llvm.aarch64.sve.stnt1.pn.x2.nxv2f64(<vscale x 2 x double>, <vscale x 2 x double>, target("aarch64.svcount"), ptr)
+
+
+declare void @llvm.aarch64.sve.st1.pn.x4.nxv16i8(<vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, target("aarch64.svcount"), ptr)
+declare void @llvm.aarch64.sve.st1.pn.x4.nxv8i16(<vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>, target("aarch64.svcount"), ptr)
+declare void @llvm.aarch64.sve.st1.pn.x4.nxv4i32(<vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32>, target("aarch64.svcount"), ptr)
+declare void @llvm.aarch64.sve.st1.pn.x4.nxv2i64(<vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64>, target("aarch64.svcount"), ptr)
+declare void @llvm.aarch64.sve.st1.pn.x4.nxv8f16(<vscale x 8 x half>, <vscale x 8 x half>, <vscale x 8 x half>, <vscale x 8 x half>, target("aarch64.svcount"), ptr)
+declare void @llvm.aarch64.sve.st1.pn.x4.nxv8bf16(<vscale x 8 x bfloat>, <vscale x 8 x bfloat>, <vscale x 8 x bfloat>, <vscale x 8 x bfloat>, target("aarch64.svcount"), ptr)
+declare void @llvm.aarch64.sve.st1.pn.x4.nxv4f32(<vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float>, target("aarch64.svcount"), ptr)
+declare void @llvm.aarch64.sve.st1.pn.x4.nxv2f64(<vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x double>, target("aarch64.svcount"), ptr)
+declare void @llvm.aarch64.sve.stnt1.pn.x4.nxv16i8(<vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, target("aarch64.svcount"), ptr)
+declare void @llvm.aarch64.sve.stnt1.pn.x4.nxv8i16(<vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>, target("aarch64.svcount"), ptr)
+declare void @llvm.aarch64.sve.stnt1.pn.x4.nxv4i32(<vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32>, target("aarch64.svcount"), ptr)
+declare void @llvm.aarch64.sve.stnt1.pn.x4.nxv2i64(<vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64>, target("aarch64.svcount"), ptr)
+declare void @llvm.aarch64.sve.stnt1.pn.x4.nxv8f16(<vscale x 8 x half>, <vscale x 8 x half>, <vscale x 8 x half>, <vscale x 8 x half>, target("aarch64.svcount"), ptr)
+declare void @llvm.aarch64.sve.stnt1.pn.x4.nxv8bf16(<vscale x 8 x bfloat>, <vscale x 8 x bfloat>, <vscale x 8 x bfloat>, <vscale x 8 x bfloat>, target("aarch64.svcount"), ptr)
+declare void @llvm.aarch64.sve.stnt1.pn.x4.nxv4f32(<vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float>, target("aarch64.svcount"), ptr)
+declare void @llvm.aarch64.sve.stnt1.pn.x4.nxv2f64(<vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x double>, target("aarch64.svcount"), ptr)